Date:	Fri, 31 Mar 2000 13:34:28 -0800 (PST)
From:	Eduardo Horvath <eeh@turbolinux.com>
To:	linux-kernel@vger.rutgers.edu
Subject: [PATCH] Really disabling overcommit.


This patch prevents the Linux kernel from allowing VM overcommit.  It
was generated against a 2.3.99-pre3 kernel.

This is accomplished by reserving memory whenever a new COW
(copy-on-write) or ZFOD (zero-fill-on-demand) mapping is created.  Each
process also reserves space for a stack up to the stack soft resource
limit.  To waste less space on stack reservations, the initial stack
soft limit has been reduced to 2MB, which should be sufficient for most
applications.  Since resource limits are inherited, setting appropriate
limits early in the boot process can be used to affect the entire
system.  If individual processes need more stack, they can increase the
soft limit up to the hard limit, which is essentially unlimited, or
until the kernel is unable to reserve sufficient swap resources.
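
For example, a process that needs more than the 2MB default can raise
its own soft limit through the standard setrlimit(2) interface.  A
minimal userland sketch (the error handling is illustrative, not part
of the patch):

	#include <stdio.h>
	#include <sys/resource.h>

	int main(void)
	{
		struct rlimit rl;

		if (getrlimit(RLIMIT_STACK, &rl) < 0) {
			perror("getrlimit");
			return 1;
		}

		/* Raise the soft limit to 8MB, capped at the hard
		 * limit.  With this patch the kernel reserves the
		 * additional stack backing here and refuses the call
		 * if the reservation cannot be made. */
		rl.rlim_cur = 8 * 1024 * 1024;
		if (rl.rlim_cur > rl.rlim_max)
			rl.rlim_cur = rl.rlim_max;
		if (setrlimit(RLIMIT_STACK, &rl) < 0) {
			perror("setrlimit");
			return 1;
		}
		return 0;
	}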

Available virtual memory is calculated as the sum of all swap space
plus free and reclaimable RAM, essentially the same value as was used
before.  The kernel now operates in four different modes depending on
the value of sysctl_overcommit_memory:

 1	Do accounting but do not prevent any allocations (old behavior).

 0	Do accounting but only prevent individual allocations that
	exceed total VM (old behavior).

-1	Do accounting and prevent a user from making the amount of
	reserved memory exceed the total virtual memory.

-2	Same as above, but the limit also applies to root.

The default is -1, which essentially allows root to do whatever it
wants; if someone has broken root, you are in trouble anyway.
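
For example, an init script could switch the whole system to the
strictest mode early in boot.  A sketch, assuming the sysctl keeps its
usual home at /proc/sys/vm/overcommit_memory:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/vm/overcommit_memory", "w");

		if (f == NULL) {
			perror("overcommit_memory");
			return 1;
		}
		/* -2: strict accounting for everyone, root included. */
		fprintf(f, "-2\n");
		return fclose(f) ? 1 : 0;
	}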

If the kernel itself requires memory, it can allocate as much as it
wants, which can bring the system into an unsafe state (reserved >
total).

Memory segments that are not COW, ZFOD, or otherwise swap-backed do
not require reservation.
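
Concretely, it is the private writable (COW) and anonymous (ZFOD)
mappings that get charged.  A hypothetical sketch (fd is an open file
descriptor, error handling omitted):

	#define _GNU_SOURCE
	#include <stddef.h>
	#include <sys/mman.h>

	void map_examples(int fd, size_t len)
	{
		/* Shared file mapping: backed by the file itself,
		 * so no swap reservation is made. */
		void *shared = mmap(0, len, PROT_READ | PROT_WRITE,
				    MAP_SHARED, fd, 0);

		/* Private writable mapping: COW pages may need swap,
		 * so the full length is reserved up front and mmap()
		 * fails with ENOMEM if it cannot be. */
		void *cow = mmap(0, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE, fd, 0);

		/* Anonymous mapping: ZFOD pages, also reserved. */
		void *zfod = mmap(0, len, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		(void)shared; (void)cow; (void)zfod;
	}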

Operations that create new memory maps reserve stack space up to the
stack limit, as well as enough to back the rest of the memory map.
clone(2) operations that share memory maps do not reserve memory.
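
In other words, a CLONE_VM clone is never charged, while a full fork()
now reserves backing for the child.  A userland sketch, assuming the
usual glibc clone() wrapper and a downward-growing stack:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdlib.h>
	#include <unistd.h>

	static int child_fn(void *arg)
	{
		return 0;
	}

	int main(void)
	{
		size_t stksz = 64 * 1024;
		char *stk = malloc(stksz);

		/* CLONE_VM shares the mm, so no new reservation. */
		clone(child_fn, stk + stksz, CLONE_VM | SIGCHLD, NULL);

		/* fork() copies the mm: every private writable
		 * mapping plus the child's stack limit is reserved,
		 * so fork() can now fail with ENOMEM even when RAM
		 * looks plentiful. */
		if (fork() == 0)
			_exit(0);
		return 0;
	}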

This could certainly be improved upon, but it should at least give some
concrete numbers for discussing this OOM issue.

Proper handling of NORESERVE mappings will be in the next version.

Eduardo Horvath				   

Index: linux/fs/exec.c
diff -u linux/fs/exec.c:1.1.1.1 linux/fs/exec.c:1.2
--- linux/fs/exec.c:1.1.1.1	Fri Mar 24 15:21:18 2000
+++ linux/fs/exec.c	Wed Mar 29 14:59:52 2000
@@ -360,21 +360,27 @@
 static int exec_mmap(void)
 {
 	struct mm_struct * mm, * old_mm;
+	struct task_struct * tsk = current;
+	unsigned long reserved = 0;
 
-	old_mm = current->mm;
+	old_mm = tsk->mm;
 	if (old_mm && atomic_read(&old_mm->mm_users) == 1) {
+		/* Keep old stack reservation */
 		flush_cache_mm(old_mm);
 		mm_release();
 		exit_mmap(old_mm);
 		flush_tlb_mm(old_mm);
 		return 0;
 	}
-
+	reserved = vm_enough_memory(tsk->rlim[RLIMIT_STACK].rlim_cur >>
+		PAGE_SHIFT);
+	if (!reserved)
+		return -ENOMEM;
 	mm = mm_alloc();
 	if (mm) {
-		struct mm_struct *active_mm = current->active_mm;
+		struct mm_struct *active_mm = tsk->active_mm;
 
-		init_new_context(current, mm);
+		init_new_context(tsk, mm);
 		current->mm = mm;
 		current->active_mm = mm;
 		activate_mm(active_mm, mm);
@@ -387,6 +393,7 @@
 		mmdrop(active_mm);
 		return 0;
 	}
+	vm_release_memory(reserved);
 	return -ENOMEM;
 }
 
Index: linux/fs/proc/proc_misc.c
diff -u linux/fs/proc/proc_misc.c:1.1.1.1 linux/fs/proc/proc_misc.c:1.4
--- linux/fs/proc/proc_misc.c:1.1.1.1	Fri Mar 24 15:21:23 2000
+++ linux/fs/proc/proc_misc.c	Thu Mar 30 16:17:12 2000
@@ -165,7 +165,9 @@
                 "LowTotal:  %8lu kB\n"
                 "LowFree:   %8lu kB\n"
                 "SwapTotal: %8lu kB\n"
-                "SwapFree:  %8lu kB\n",
+                "SwapFree:  %8lu kB\n"
+                "VMTotal:   %8lu kB\n"
+                "VMReserved:%8lu kB\n",
                 K(i.totalram),
                 K(i.freeram),
                 K(i.sharedram),
@@ -176,7 +178,9 @@
                 K(i.totalram-i.totalhigh),
                 K(i.freeram-i.freehigh),
                 K(i.totalswap),
-                K(i.freeswap));
+                K(i.freeswap),
+                K(vm_total()), 
+                K(vm_reserved));
 
 	if (len <= off+count) *eof = 1;
 	*start = page + off;
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.1 linux/include/linux/mm.h:1.4
--- linux/include/linux/mm.h:1.1.1.1	Fri Mar 24 15:21:39 2000
+++ linux/include/linux/mm.h	Thu Mar 30 15:01:21 2000
@@ -21,6 +21,13 @@
 #include <asm/atomic.h>
 
 /*
+ * These are used to prevent VM overcommit.
+ */
+extern unsigned long vm_reserved;
+extern spinlock_t vm_lock;
+extern inline unsigned long vm_total(void);
+
+/*
  * Linux kernel virtual memory manager primitives.
  * The idea being to have a "virtual" mm in the same way
  * we have a virtual fs - giving a cleaner interface to the
@@ -441,6 +448,14 @@
 extern unsigned long do_brk(unsigned long, unsigned long);
 
 struct zone_t;
+
+extern long vm_enough_memory(long pages);
+extern inline void vm_release_memory(long pages) {
+	unsigned long flags;
+	spin_lock_irqsave(&vm_lock, flags);
+	vm_reserved -= pages;
+	spin_unlock_irqrestore(&vm_lock, flags);
+}
 /* filemap.c */
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.1 linux/include/linux/sched.h:1.2
--- linux/include/linux/sched.h:1.1.1.1	Fri Mar 24 15:21:39 2000
+++ linux/include/linux/sched.h	Wed Mar 29 15:00:04 2000
@@ -380,9 +380,9 @@
 
 /*
  * Limit the stack by to some sane default: root can always
- * increase this limit if needed..  8MB seems reasonable.
+ * increase this limit if needed..  2MB should be more than enough.
  */
-#define _STK_LIM	(8*1024*1024)
+#define _STK_LIM	(2*1024*1024)
 
 #define DEF_PRIORITY	(20*HZ/100)	/* 200 ms time slices */
 
Index: linux/ipc/shm.c
diff -u linux/ipc/shm.c:1.1.1.1 linux/ipc/shm.c:1.2
--- linux/ipc/shm.c:1.1.1.1	Fri Mar 24 15:23:02 2000
+++ linux/ipc/shm.c	Wed Mar 29 15:00:05 2000
@@ -1668,17 +1668,23 @@
 
 int map_zero_setup(struct vm_area_struct *vma)
 {
-	extern int vm_enough_memory(long pages);
+	extern long vm_enough_memory(long pages);
 	struct shmid_kernel *shp;
 	struct file *filp;
+	int error = -ENOMEM;
+	long reserved = 0;
 
-	if (!vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))
-		return -ENOMEM;
-	if (IS_ERR(shp = newseg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE, 0)))
-		return PTR_ERR(shp);
+	if (!(reserved = 
+		vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
+		goto err;
+	if (IS_ERR(shp = newseg_alloc((vma->vm_end - vma->vm_start) / 
+		PAGE_SIZE, 0))) {
+		error = PTR_ERR(shp);
+		goto err;
+	}
 	if ((filp = file_setup(vma->vm_file, shp)) == 0) {
 		killseg_core(shp, 0);
-		return -ENOMEM;
+		goto err;
 	}
 	vma->vm_file = filp;
 	VMA_TO_SHP(vma) = (void *)shp;
@@ -1690,6 +1696,9 @@
 	list_add(&shp->zero_list, &zshmid_kernel.zero_list);
 	spin_unlock(&zmap_list_lock);
 	return 0;
+err:
+	vm_release_memory(reserved);
+	return error;
 }
 
 static void shmzero_open(struct vm_area_struct *shmd)
Index: linux/kernel/exit.c
diff -u linux/kernel/exit.c:1.1.1.1 linux/kernel/exit.c:1.2
--- linux/kernel/exit.c:1.1.1.1	Fri Mar 24 15:21:38 2000
+++ linux/kernel/exit.c	Wed Mar 29 15:00:05 2000
@@ -286,6 +286,11 @@
 	if (mm) {
 		atomic_inc(&mm->mm_count);
 		mm_release();
+		if (atomic_read(&mm->mm_users) == 1) {
+		/* Only release stack if we're the last one using this mm */
+			vm_release_memory(tsk->rlim[RLIMIT_STACK].rlim_cur >>
+				PAGE_SHIFT);
+		}
 		if (mm != tsk->active_mm) BUG();
 		tsk->mm = NULL;
 		enter_lazy_tlb(mm, current, smp_processor_id());
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.1 linux/kernel/fork.c:1.2
--- linux/kernel/fork.c:1.1.1.1	Fri Mar 24 15:21:38 2000
+++ linux/kernel/fork.c	Wed Mar 29 15:00:05 2000
@@ -231,6 +231,7 @@
 static inline int dup_mmap(struct mm_struct * mm)
 {
 	struct vm_area_struct * mpnt, *tmp, **pprev;
+	unsigned long reserved = 0;
 	int retval;
 
 	/* Kill me slowly. UGLY! FIXME! */
@@ -242,6 +243,16 @@
 		struct file *file;
 
 		retval = -ENOMEM;
+		reserved = 0;
+
+		if ((mpnt->vm_flags & (VM_GROWSDOWN | VM_WRITE | VM_SHARED)) ==
+			VM_WRITE) {
+			unsigned long npages = mpnt->vm_end - mpnt->vm_start;
+
+			reserved = vm_enough_memory(npages >> PAGE_SHIFT);
+			if (!reserved)
+				goto fail_nomem;
+		}
 		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
@@ -365,6 +376,7 @@
 static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct mm_struct * mm;
+	unsigned long reserved;
 	int retval;
 
 	tsk->min_flt = tsk->maj_flt = 0;
@@ -389,6 +401,11 @@
 	}
 
 	retval = -ENOMEM;
+	reserved = vm_enough_memory(tsk->rlim[RLIMIT_STACK].rlim_cur >>
+		PAGE_SHIFT);
+	if (!reserved)
+		goto fail_nomem;
+
 	mm = mm_alloc();
 	if (!mm)
 		goto fail_nomem;
@@ -416,6 +433,8 @@
 free_pt:
 	mmput(mm);
 fail_nomem:
+	if (reserved)
+		vm_release_memory(reserved);
 	return retval;
 }
 
Index: linux/kernel/sys.c
diff -u linux/kernel/sys.c:1.1.1.1 linux/kernel/sys.c:1.3
--- linux/kernel/sys.c:1.1.1.1	Fri Mar 24 15:21:38 2000
+++ linux/kernel/sys.c	Thu Mar 30 11:38:54 2000
@@ -952,6 +952,7 @@
 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit *rlim)
 {
 	struct rlimit new_rlim, *old_rlim;
+	struct task_struct *tsk;
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
@@ -959,7 +960,8 @@
 		return -EFAULT;
 	if (new_rlim.rlim_cur < 0 || new_rlim.rlim_max < 0)
 		return -EINVAL;
-	old_rlim = current->rlim + resource;
+	tsk = current;
+	old_rlim = tsk->rlim + resource;
 	if (((new_rlim.rlim_cur > old_rlim->rlim_max) ||
 	     (new_rlim.rlim_max > old_rlim->rlim_max)) &&
 	    !capable(CAP_SYS_RESOURCE))
@@ -967,6 +969,17 @@
 	if (resource == RLIMIT_NOFILE) {
 		if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
 			return -EPERM;
+	}
+	/* if PF_VFORK is set we're just borrowing the VM so don't touch it */
+	if (resource == RLIMIT_STACK && !(tsk->flags & PF_VFORK)) {
+		long newpages =
+			((long)(new_rlim.rlim_cur - old_rlim->rlim_cur) >>
+				PAGE_SHIFT);
+		if (newpages > 0 && !vm_enough_memory(newpages))
+			/* We should really return EAGAIN or ENOMEM. */
+			return -EPERM;
+		if (newpages < 0)
+			vm_release_memory(-newpages);
 	}
 	*old_rlim = new_rlim;
 	return 0;
Index: linux/mm/mmap.c
diff -u linux/mm/mmap.c:1.1.1.1 linux/mm/mmap.c:1.4
--- linux/mm/mmap.c:1.1.1.1	Fri Mar 24 15:21:39 2000
+++ linux/mm/mmap.c	Thu Mar 30 15:01:17 2000
@@ -39,12 +39,27 @@
 /* SLAB cache for vm_area_struct's. */
 kmem_cache_t *vm_area_cachep;
 
-int sysctl_overcommit_memory;
+int sysctl_overcommit_memory = -1;
 
+/* Unfortunately these need to be longs so we need a spinlock. */
+unsigned long vm_reserved = 0;
+spinlock_t vm_lock = SPIN_LOCK_UNLOCKED;
+
+unsigned long vm_total(void) 
+{
+	unsigned long free;
+
+	free = atomic_read(&buffermem_pages);
+	free += atomic_read(&page_cache_size);
+	free += nr_free_pages();
+	free += nr_swap_pages;
+	return free;
+}
+
 /* Check that a process has enough memory to allocate a
  * new virtual mapping.
  */
-int vm_enough_memory(long pages)
+long vm_enough_memory(long pages)
 {
 	/* Stupid algorithm to decide if we have enough memory: while
 	 * simple, it hopefully works in most obvious cases.. Easy to
@@ -55,18 +70,42 @@
 	 * (buffers+cache), use the minimum values.  Allow an extra 2%
 	 * of num_physpages for safety margin.
 	 */
+	/*
+	 * Reserve some pages.  Only allow vm_reserved to exceed
+	 * vm_total if we're root.
+	 */
 
-	long free;
-	
-        /* Sometimes we want to use more memory than we have. */
-	if (sysctl_overcommit_memory)
-	    return 1;
+	unsigned long flags;
+	long free = 0;
 
-	free = atomic_read(&buffermem_pages);
-	free += atomic_read(&page_cache_size);
-	free += nr_free_pages();
-	free += nr_swap_pages;
-	return free > pages;
+	spin_lock_irqsave(&vm_lock, flags);
+	free  = vm_total();
+	switch (sysctl_overcommit_memory) {
+	case -2:
+		/* Don't overcommit at all, even root */
+		if (vm_reserved + pages > free) 
+			pages = 0;
+		/* FALLTHROUGH */
+	case -1:
+	default:
+		/* Don't overcommit at all */
+		if (vm_reserved + pages > free &&
+			current->uid) 
+			pages = 0;
+		/* FALLTHROUGH */
+	case 0:
+		/* Only overcommit a little */
+		if (pages > free)
+			pages = 0;
+		/* FALLTHROUGH */
+	case 1:
+		/* Always overcommit */
+		vm_reserved += pages;
+		break;
+	}
+	spin_unlock_irqrestore(&vm_lock, flags);
+
+	return pages;
 }
 
 /* Remove one vm structure from the inode's i_mmap ring. */
@@ -124,10 +163,6 @@
 	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
 		goto out;
 
-	/* Check if we have enough memory.. */
-	if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
-		goto out;
-
 	/* Ok, looks good - let it rip. */
 	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 		goto out;
@@ -166,6 +201,7 @@
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma;
+	long reserved = 0;
 	int error;
 
 	if (file && (!file->f_op || !file->f_op->mmap))
@@ -290,7 +326,7 @@
 	/* Private writable mapping? Check memory availability.. */
 	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
 	    !(flags & MAP_NORESERVE)				 &&
-	    !vm_enough_memory(len >> PAGE_SHIFT))
+	    !(reserved = vm_enough_memory(len >> PAGE_SHIFT)))
 		goto free_vma;
 
 	if (file) {
@@ -344,6 +380,7 @@
 	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
 	flush_tlb_range(mm, vma->vm_start, vma->vm_end);
 free_vma:
+	vm_release_memory(reserved);
 	kmem_cache_free(vm_area_cachep, vma);
 	return error;
 }
@@ -522,6 +559,9 @@
 	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
+	if ((area->vm_flags & (VM_GROWSDOWN | VM_WRITE | VM_SHARED)) 
+		== VM_WRITE)
+		vm_release_memory(len >> PAGE_SHIFT);
 
 	/* Unmapping the whole area. */
 	if (addr == area->vm_start && end == area->vm_end) {
@@ -744,7 +784,7 @@
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma;
-	unsigned long flags, retval;
+	unsigned long flags, retval, reserved = 0;
 
 	len = PAGE_ALIGN(len);
 	if (!len)
@@ -775,16 +815,17 @@
 	if (mm->map_count > MAX_MAP_COUNT)
 		return -ENOMEM;
 
-	if (!vm_enough_memory(len >> PAGE_SHIFT))
+	if (!(reserved = vm_enough_memory(len >> PAGE_SHIFT)))
 		return -ENOMEM;
 
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
 	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-	if (!vma)
+	if (!vma) {
+		vm_release_memory(reserved);
 		return -ENOMEM;
-
+	}
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
@@ -858,6 +899,9 @@
 		zap_page_range(mm, start, size);
 		if (mpnt->vm_file)
 			fput(mpnt->vm_file);
+		if ((mpnt->vm_flags & (VM_GROWSDOWN | VM_WRITE | VM_SHARED)) 
+			== VM_WRITE)
+			vm_release_memory(size >> PAGE_SHIFT);
 		kmem_cache_free(vm_area_cachep, mpnt);
 		mpnt = next;
 	}
Index: linux/mm/mremap.c
diff -u linux/mm/mremap.c:1.1.1.1 linux/mm/mremap.c:1.2
--- linux/mm/mremap.c:1.1.1.1	Fri Mar 24 15:21:39 2000
+++ linux/mm/mremap.c	Wed Mar 29 15:00:06 2000
@@ -13,8 +13,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
-extern int vm_enough_memory(long pages);
-
 static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t * pgd;
@@ -171,7 +169,7 @@
 	unsigned long flags, unsigned long new_addr)
 {
 	struct vm_area_struct *vma;
-	unsigned long ret = -EINVAL;
+	unsigned long ret = -EINVAL, reserved = 0;
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		goto out;
@@ -239,7 +237,7 @@
 	/* Private writable mapping? Check memory availability.. */
 	if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
 	    !(flags & MAP_NORESERVE)				 &&
-	    !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT))
+	    !(reserved = vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)))
 		goto out;
 
 	/* old_len exactly to the end of the area..
@@ -264,6 +262,7 @@
 						   addr + new_len);
 			}
 			ret = addr;
+			reserved = 0;
 			goto out;
 		}
 	}
@@ -280,8 +279,12 @@
 				goto out;
 		}
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
+		if (ret != -ENOMEM) 
+			reserved = 0;
 	}
 out:
+	if (reserved)
+		vm_release_memory(reserved);
 	return ret;
 }
 
Index: linux/mm/swapfile.c
diff -u linux/mm/swapfile.c:1.1.1.1 linux/mm/swapfile.c:1.4
--- linux/mm/swapfile.c:1.1.1.1	Fri Mar 24 15:21:39 2000
+++ linux/mm/swapfile.c	Thu Mar 30 16:04:10 2000
@@ -17,6 +17,8 @@
 
 #include <asm/pgtable.h>
 
+extern int sysctl_overcommit_memory;
+
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles = 0;
 
@@ -444,7 +446,7 @@
 {
 	struct swap_info_struct * p = NULL;
 	struct dentry * dentry;
-	int i, type, prev;
+	int i, type, prev; unsigned long flags;
 	int err;
 	
 	if (!capable(CAP_SYS_ADMIN))
@@ -490,7 +492,18 @@
 	nr_swap_pages -= p->pages;
 	swap_list_unlock();
 	p->flags = SWP_USED;
-	err = try_to_unuse(type);
+
+	/* Don't allow removal of swap if it will cause overcommit */
+	spin_lock_irqsave(&vm_lock, flags);
+	if ((sysctl_overcommit_memory < 0) && 
+		(vm_reserved > vm_total())) {
+		spin_unlock_irqrestore(&vm_lock, flags);
+		err = -ENOMEM;
+	} else {
+		spin_unlock_irqrestore(&vm_lock, flags);
+		err = try_to_unuse(type);
+	}
+
 	if (err) {
 		/* re-insert swap space back into swap_list */
 		swap_list_lock();
@@ -595,6 +608,7 @@
 	unsigned long maxpages;
 	int swapfilesize;
 	struct block_device *bdev = NULL;
+	unsigned long flags;
 	
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;


