This is probably the most important patch.  It's going to need some
more explanation from Andrea.

- Much more aggressive in shrinking the inode/dcache/quota caches

- Avoid pointlessly calling swap_out a zillion times if it is known
  to be failing.  (Should fix the "kswapd went crazy with no swap"
  problem).

- The oom_killer was killed.  Instead, we just allow allocations to fail
  (see the condensed try_to_free_pages() flow after these notes).

- There's a special case for the system-critical /sbin/init: init will
  just keep spinning in try_to_free_pages() until memory is available.

- We now scan all mm's twice in swap_out.  Andrea's original
  changelog doesn't explain *why* this is done.
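
  Mechanically, "twice" comes from the doubled loop counter at the top of
  swap_out(), the one-line change in the first hunk below.  Paraphrasing
  the loop shape (only the first line appears in the hunk; the body and
  exit condition are summarised from the surrounding, unchanged code):

	counter = mmlist_nr << 1;	/* was: counter = mmlist_nr */
	do {
		/* reschedule if needed, take the next mm off the mmlist
		 * and call swap_out_mm() on it; stop early once enough
		 * pages have been unmapped */
	} while (--counter >= 0);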


With this patch applied on its own, the VM becomes totally unusable:
vmscan now relies on the new per-zone active/inactive page counts, and
nothing maintains them yet.  The changes in aa-110-zone_accounting are
also needed.
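
For reference, here is the reworked try_to_free_pages() from the diff
below, condensed (the CONFIG_QUOTA shrink is dropped and lines
re-wrapped), with comments tying it back to the notes above:

int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
{
	gfp_mask = pf_gfp_mask(gfp_mask);

	for (;;) {
		int tries = vm_passes;
		/* no __GFP_IO means swapout is treated as already failed */
		int failed_swapout = !(gfp_mask & __GFP_IO);
		int nr_pages = SWAP_CLUSTER_MAX;

		do {
			nr_pages = shrink_caches(classzone, gfp_mask,
						 nr_pages, &failed_swapout);
			if (nr_pages <= 0)
				return 1;	/* enough progress was made */

			/* the more aggressive VFS cache shrinking */
			shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
			shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);

			/* stop banging on swap_out() once it has failed */
			if (!failed_swapout)
				failed_swapout = !swap_out(classzone);
		} while (--tries);

		/* no oom_killer: ordinary tasks just see the allocation fail */
		if (likely(current->pid != 1))
			break;
		/* /sbin/init spins here until the zone is balanced again */
		if (!check_classzone_need_balance(classzone))
			break;
		current->policy |= SCHED_YIELD;
		__set_current_state(TASK_RUNNING);
		schedule();
	}

	return 0;
}

Returning 0 here, rather than calling out_of_memory(), is what lets the
caller's allocation fail instead of killing a process.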

=====================================

--- 2.4.19-pre3/mm/vmscan.c~aa-096-swap_out	Tue Mar 19 19:48:54 2002
+++ 2.4.19-pre3-akpm/mm/vmscan.c	Tue Mar 19 19:49:10 2002
@@ -321,13 +321,13 @@ out_unlock:
 	return count;
 }
 
-static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
-static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
+static int FASTCALL(swap_out(zone_t * classzone));
+static int swap_out(zone_t * classzone)
 {
 	int counter, nr_pages = SWAP_CLUSTER_MAX;
 	struct mm_struct *mm;
 
-	counter = mmlist_nr;
+	counter = mmlist_nr << 1;
 	do {
 		if (unlikely(current->need_resched)) {
 			__set_current_state(TASK_RUNNING);
@@ -363,15 +363,15 @@ empty:
 	return 0;
 }
 
-static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
-static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
+static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
+static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
+static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
 {
 	struct list_head * entry;
-	int max_scan = nr_inactive_pages / priority;
-	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
+	int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
+	int max_mapped = vm_mapped_ratio * nr_pages;
 
-	spin_lock(&pagemap_lru_lock);
-	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
+	while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
 		struct page * page;
 
 		if (unlikely(current->need_resched)) {
@@ -499,34 +499,49 @@ static int shrink_cache(int nr_pages, zo
 		spin_lock(&pagecache_lock);
 
 		/*
-		 * this is the non-racy check for busy page.
+		 * This is the non-racy check for busy page.
+		 * It is critical to check PageDirty _after_ we made sure
+		 * the page is freeable so not in use by anybody.
+		 * At this point we're guaranteed that page->buffers is NULL,
+		 * nobody can refill page->buffers under us because we still
+		 * hold the page lock.
 		 */
-		if (!page->mapping || !is_page_cache_freeable(page)) {
+		if (!page->mapping || page_count(page) > 1) {
 			spin_unlock(&pagecache_lock);
 			UnlockPage(page);
 page_mapped:
-			if (--max_mapped >= 0)
-				continue;
+			if (--max_mapped < 0) {
+				spin_unlock(&pagemap_lru_lock);
 
-			/*
-			 * Alert! We've found too many mapped pages on the
-			 * inactive list, so we start swapping out now!
-			 */
-			spin_unlock(&pagemap_lru_lock);
-			swap_out(priority, gfp_mask, classzone);
-			return nr_pages;
-		}
+				nr_pages -= kmem_cache_reap(gfp_mask);
+				if (nr_pages <= 0)
+					goto out;
 
-		/*
-		 * It is critical to check PageDirty _after_ we made sure
-		 * the page is freeable* so not in use by anybody.
-		 */
+				shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
+				shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
+#ifdef CONFIG_QUOTA
+				shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
+#endif
+
+				if (!*failed_swapout)
+					*failed_swapout = !swap_out(classzone);
+
+				max_mapped = nr_pages * vm_mapped_ratio;
+
+				spin_lock(&pagemap_lru_lock);
+				refill_inactive(nr_pages, classzone);
+			}
+			continue;
+			
+		}
 		if (PageDirty(page)) {
 			spin_unlock(&pagecache_lock);
 			UnlockPage(page);
 			continue;
 		}
 
+		__lru_cache_del(page);
+
 		/* point of no return */
 		if (likely(!PageSwapCache(page))) {
 			__remove_inode_page(page);
@@ -539,7 +554,6 @@ page_mapped:
 			swap_free(swap);
 		}
 
-		__lru_cache_del(page);
 		UnlockPage(page);
 
 		/* effectively free the page here */
@@ -551,6 +565,7 @@ page_mapped:
 	}
 	spin_unlock(&pagemap_lru_lock);
 
+ out:
 	return nr_pages;
 }
 
@@ -561,13 +576,15 @@ page_mapped:
  * We move them the other way when we see the
  * reference bit on the page.
  */
-static void refill_inactive(int nr_pages)
+static void refill_inactive(int nr_pages, zone_t * classzone)
 {
 	struct list_head * entry;
+	unsigned long ratio;
+
+	ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
 
-	spin_lock(&pagemap_lru_lock);
 	entry = active_list.prev;
-	while (nr_pages && entry != &active_list) {
+	while (ratio && entry != &active_list) {
 		struct page * page;
 
 		page = list_entry(entry, struct page, lru);
@@ -584,54 +601,63 @@ static void refill_inactive(int nr_pages
 		add_page_to_inactive_list(page);
 		SetPageReferenced(page);
 	}
-	spin_unlock(&pagemap_lru_lock);
+	if (entry != &active_list) {
+		list_del(&active_list);
+		list_add(&active_list, entry);
+	}
 }
 
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
+static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
+static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
 {
-	int chunk_size = nr_pages;
-	unsigned long ratio;
-
 	nr_pages -= kmem_cache_reap(gfp_mask);
 	if (nr_pages <= 0)
-		return 0;
-
-	nr_pages = chunk_size;
-	/* try to keep the active list 2/3 of the size of the cache */
-	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
-	refill_inactive(ratio);
+		goto out;
 
-	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
-	if (nr_pages <= 0)
-		return 0;
+	spin_lock(&pagemap_lru_lock);
+	refill_inactive(nr_pages, classzone);
 
-	shrink_dcache_memory(priority, gfp_mask);
-	shrink_icache_memory(priority, gfp_mask);
-#ifdef CONFIG_QUOTA
-	shrink_dqcache_memory(priority, gfp_mask);
-#endif
+	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
 
+ out:
 	return nr_pages;
 }
 
+static int check_classzone_need_balance(zone_t * classzone);
+
 int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
 {
-	int priority = 6;
-	int nr_pages = SWAP_CLUSTER_MAX;
-
 	gfp_mask = pf_gfp_mask(gfp_mask);
-	do {
-		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
-		if (nr_pages <= 0)
-			return 1;
-	} while (--priority);
 
-	/*
-	 * Hmm.. Cache shrink failed - time to kill something?
-	 * Mhwahahhaha! This is the part I really like. Giggle.
-	 */
-	out_of_memory();
+	for (;;) {
+		int tries = vm_passes;
+		int failed_swapout = !(gfp_mask & __GFP_IO);
+		int nr_pages = SWAP_CLUSTER_MAX;
+
+		do {
+			nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
+			if (nr_pages <= 0)
+				return 1;
+
+			shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
+			shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
+#ifdef CONFIG_QUOTA
+			shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
+#endif
+
+			if (!failed_swapout)
+				failed_swapout = !swap_out(classzone);
+		} while (--tries);
+
+		if (likely(current->pid != 1))
+			break;
+		if (!check_classzone_need_balance(classzone))
+			break;
+		current->policy |= SCHED_YIELD;
+		__set_current_state(TASK_RUNNING);
+		schedule();
+	}
+
 	return 0;
 }
 
--- 2.4.19-pre3/mm/oom_kill.c~aa-096-swap_out	Tue Mar 19 19:48:54 2002
+++ 2.4.19-pre3-akpm/mm/oom_kill.c	Tue Mar 19 19:49:13 2002
@@ -21,6 +21,8 @@
 #include <linux/swapctl.h>
 #include <linux/timex.h>
 
+#if 0		/* Nothing in this file is used */
+
 /* #define DEBUG */
 
 /**
@@ -243,3 +245,5 @@ reset:
 	first = now;
 	count = 0;
 }
+
+#endif	/* Unused file */
--- 2.4.19-pre3/include/linux/mmzone.h~aa-096-swap_out	Tue Mar 19 19:48:54 2002
+++ 2.4.19-pre3-akpm/include/linux/mmzone.h	Tue Mar 19 19:48:54 2002
@@ -41,7 +41,18 @@ typedef struct zone_struct {
 	spinlock_t		lock;
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
-	int			need_balance;
+
+	/*
+	 * The below fields are protected by different locks (or by
+	 * no lock at all like need_balance), so they're longs to
+	 * provide an atomic granularity against each other on
+	 * all architectures.
+	 */
+	unsigned long		need_balance;
+	/* protected by the pagemap_lru_lock */
+	unsigned long		nr_active_pages, nr_inactive_pages;
+	/* protected by the pagecache_lock */
+	unsigned long		nr_cache_pages;
 
 	/*
 	 * free areas of different sizes