Date:	Tue, 4 Apr 2000 17:43:11 -0300 (BRST)
From:	Rik van Riel <riel@conectiva.com.br>
To:	linux-kernel@vger.rutgers.edu
Subject: [PATCH] fair scheduler for 2.3.99-3

Hi,

here's a simple, low-overhead implementation of a fair
scheduler for Linux. The main scheduling path is not
changed at all, and the recalculation is probably no
more than 2 or 3 times as expensive as it is now. (The
recalculation only runs a few times a second on CPU-loaded
machines anyway, and it doesn't matter when we have some
idle time.)
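
Conceptually, the recalculation does something like the sketch
below -- a simplified, standalone illustration only (made-up
struct and function names, no locking, and it leaves out the
task-list rotation the real patch does when a user runs out of
quota); the real thing is in the sched.c hunk further down.

	#define DEF_PRIORITY 20

	struct user { long cpu_ticks; };	/* per-user quota for this round */
	struct task { long counter, priority; struct user *user; };

	static void fair_recalculate(struct task *tasks, int ntasks,
				     struct user *users, int nusers)
	{
		int i;

		for (i = 0; i < ntasks; i++) {
			struct task *p = &tasks[i];
			struct user *up = p->user;
			long old = p->counter;

			if (!up) {		/* no user_struct: old behaviour */
				p->counter = (old >> 1) + p->priority;
				continue;
			}
			if (!up->cpu_ticks)	/* this user is out of quota */
				continue;

			p->counter = (old >> 1) + p->priority;
			up->cpu_ticks -= p->counter - old;	/* charge the user */
			if (up->cpu_ticks < 0) {	/* overdraft: clip the task */
				p->counter += up->cpu_ticks;
				up->cpu_ticks = 0;
			}
		}

		/* refill every user's quota for the next round */
		for (i = 0; i < nusers; i++)
			users[i].cpu_ticks = (users[i].cpu_ticks >> 1) + DEF_PRIORITY;
	}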

When the machine isn't overloaded, nice levels and
everything else work just as they do now, but when
the machine is overloaded, the fair scheduler does
its job and makes sure CPU time is divided fairly
between users.
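
For example, if user A runs ten CPU hogs and user B runs a
single one, the stock scheduler hands A roughly ten times the
CPU that B gets; with the fair scheduler switched on, A and B
each get about half of the machine, and A's ten processes share
A's half among themselves.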

Pros:
- one user can no longer dominate CPU usage on a
  machine by running lots of processes (also goes
  for exploding daemons)
- can be switched on/off in /proc _or_ completely
  CONFIGed out -- no overhead if not used
- low overhead when used
- the main scheduler path is unchanged
- in non-overloaded situations, nice levels still
  work exactly as expected

Cons:
- root cpu usage cannot be put under quota yet
  (but I'll work on it for future versions)
- doesn't work for all daemons (they aren't put
  under a new user_struct)
- cpu usage for the different processes "within"
  a user isn't always divided as expected, but it
  usually follows the different nice levels pretty
  well (and still better than it would be if the
  fair scheduler weren't active :))

When you config it in, don't forget to:
# echo 1 > /proc/sys/kernel/fairsched
to actually switch on the fair scheduler...
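
(The CONFIG_FAIRSCHED option only shows up when
CONFIG_EXPERIMENTAL is enabled -- see the config.in hunk at the
end of the patch.)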

The patch has been heavily abused for a few days now, so
most bugs should be gone. A version for 2.2.15pre<new> is
in the other post.

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

Wanna talk about the kernel?  irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/		http://www.surriel.com/


--- linux-2.3.99-pre3/kernel/sched.c.orig	Mon Mar 27 14:05:25 2000
+++ linux-2.3.99-pre3/kernel/sched.c	Tue Apr  4 15:43:40 2000
@@ -41,6 +41,10 @@
 
 extern void mem_use(void);
 
+#ifdef CONFIG_FAIRSCHED
+int fairsched = 0; /* toggle fair scheduler on/off */
+#endif /* CONFIG_FAIRSCHED */
+
 /*
  *	Init task must be ok at boot for the ix86 as we will check its signals
  *	via the SMP irq return path.
@@ -61,6 +65,7 @@
  */
 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;	/* third */
+spinlock_t recalc_lock = SPIN_LOCK_UNLOCKED;
 
 static LIST_HEAD(runqueue_head);
 
@@ -439,7 +444,7 @@
 	struct task_struct *prev, *next, *p;
 	struct list_head *tmp;
 	int this_cpu, c;
-
+	
 	if (!current->active_mm) BUG();
 	if (tq_scheduler)
 		goto handle_tq_scheduler;
@@ -509,6 +514,7 @@
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
+skipped_recalculate:
 	/*
 	 * from this point on nothing can prevent us from
 	 * switching to the next task, save this fact in
@@ -600,10 +606,75 @@
 	{
 		struct task_struct *p;
 		spin_unlock_irq(&runqueue_lock);
-		read_lock(&tasklist_lock);
-		for_each_task(p)
-			p->counter = (p->counter >> 1) + p->priority;
-		read_unlock(&tasklist_lock);
+#ifdef CONFIG_FAIRSCHED
+	/*
+	 * Simple, low-overhead fair share scheduler. It works by handing
+	 * out CPU time like we do at the normal recalculation.
+	 * The catch is that we move the list head (where the for_each_task()
+	 * loop starts) to _after_ the first task where we ran out of quota.
+	 * This means that if a user has too many runnable processes, his
+	 * tasks will get extra CPU time here in turns. -- Rik
+	 */
+		if (fairsched) {
+			struct user_struct *up;
+			long oldcounter;
+			if (!spin_trylock(&recalc_lock)) {
+				/* another cpu is recalculating */
+				spin_lock_irq(&runqueue_lock);
+			 	goto skipped_recalculate;
+			}
+			read_lock(&uidhash_lock);
+			for_each_user_struct(up)
+				up->move_task = NULL;
+			read_unlock(&uidhash_lock);
+
+			write_lock_irq(&tasklist_lock);
+			for_each_task(p) {
+			    up = p->user;
+			    if (!up) {
+				p->counter = (p->counter>>1) + p->priority;
+				continue;
+			    }
+			    if (!up->cpu_ticks)
+				continue;
+			    if (p->counter != (p->counter >> 1) + p->priority) {
+				oldcounter = p->counter;
+			        p->counter = (p->counter >> 1) + p->priority;
+				up->cpu_ticks += oldcounter;
+				up->cpu_ticks -= p->counter;
+				if (p->counter == oldcounter)
+				    BUG();
+				if (!up->move_task)
+				    up->move_task = p;
+				if (up->cpu_ticks < 0) {
+				    p->counter += up->cpu_ticks;
+				    up->cpu_ticks = 0;
+				}
+			    }
+			}
+
+			read_lock(&uidhash_lock);
+			for_each_user_struct(up) {
+			    if (!up->cpu_ticks) {
+				if (!up->move_task)
+				    BUG();
+				move_task_last(up);
+			    }
+			    /* replace DEF_PRIORITY with user quota */
+			    up->cpu_ticks = (up->cpu_ticks>>1) + DEF_PRIORITY;
+			}
+			read_unlock(&uidhash_lock);
+			write_unlock_irq(&tasklist_lock);
+			spin_unlock(&recalc_lock);
+		} else
+#endif /* CONFIG_FAIRSCHED */
+		{
+			read_lock(&tasklist_lock);
+			for_each_task(p)
+			    if (p->counter != (p->counter >> 1) + p->priority)
+				p->counter = (p->counter >> 1) + p->priority;
+			read_unlock(&tasklist_lock);
+		}
 		spin_lock_irq(&runqueue_lock);
 	}
 	goto repeat_schedule;
--- linux-2.3.99-pre3/kernel/fork.c.orig	Mon Mar 27 16:05:45 2000
+++ linux-2.3.99-pre3/kernel/fork.c	Sun Apr  2 19:30:27 2000
@@ -17,6 +17,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
+#include <linux/sched.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -44,13 +45,16 @@
  */
 #define UIDHASH_SZ	(PIDHASH_SZ >> 2)
 
-static struct user_struct {
-	atomic_t count;
-	struct user_struct *next, **pprev;
-	unsigned int uid;
-} *uidhash[UIDHASH_SZ];
+struct user_struct *uidhash[UIDHASH_SZ];
+struct user_struct uidhead = {
+	ATOMIC_INIT(0),
+	&uidhead, (struct user_struct **) &uidhead,
+	&uidhead, &uidhead,
+	NULL, 0,
+	DEF_PRIORITY,
+};
 
-spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
+rwlock_t uidhash_lock = RW_LOCK_UNLOCKED;
 
 kmem_cache_t *uid_cachep;
 
@@ -65,6 +69,10 @@
 		uidhash[hashent]->pprev = &up->next;
 	up->pprev = &uidhash[hashent];
 	uidhash[hashent] = up;
+	up->l_next = uidhead.l_next;
+	up->l_prev = &uidhead;
+	uidhead.l_next->l_prev = up;
+	uidhead.l_next = up;
 }
 
 static inline void uid_hash_remove(struct user_struct *up)
@@ -72,6 +80,8 @@
 	if(up->next)
 		up->next->pprev = up->pprev;
 	*up->pprev = up->next;
+	up->l_prev->l_next = up->l_next;
+	up->l_next->l_prev = up->l_prev;
 }
 
 static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
@@ -111,12 +121,12 @@
 	if (up) {
 		p->user = NULL;
 		if (atomic_dec_and_test(&up->count)) {
-			spin_lock(&uidhash_lock);
+			write_lock(&uidhash_lock);
 			if (uid_hash_free(up)) {
 				uid_hash_remove(up);
 				kmem_cache_free(uid_cachep, up);
 			}
-			spin_unlock(&uidhash_lock);
+			write_unlock(&uidhash_lock);
 		}
 	}
 }
@@ -126,9 +136,9 @@
 	unsigned int hashent = uidhashfn(p->uid);
 	struct user_struct *up;
 
-	spin_lock(&uidhash_lock);
+	read_lock(&uidhash_lock);
 	up = uid_hash_find(p->uid, hashent);
-	spin_unlock(&uidhash_lock);
+	read_unlock(&uidhash_lock);
 
 	if (!up) {
 		struct user_struct *new;
@@ -138,12 +148,13 @@
 			return -EAGAIN;
 		new->uid = p->uid;
 		atomic_set(&new->count, 1);
+		new->cpu_ticks = DEF_PRIORITY;
 
 		/*
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
 		 */
-		spin_lock(&uidhash_lock);
+		write_lock(&uidhash_lock);
 		up = uid_hash_find(p->uid, hashent);
 		if (up) {
 			kmem_cache_free(uid_cachep, new);
@@ -151,7 +162,7 @@
 			uid_hash_insert(new, hashent);
 			up = new;
 		}
-		spin_unlock(&uidhash_lock);
+		write_unlock(&uidhash_lock);
 
 	}
 	p->user = up;
@@ -611,6 +622,7 @@
 	lock_kernel();
 
 	retval = -EAGAIN;
+
 	if (p->user) {
 		if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
 			goto bad_fork_free;
--- linux-2.3.99-pre3/kernel/sysctl.c.orig	Mon Mar 27 16:56:50 2000
+++ linux-2.3.99-pre3/kernel/sysctl.c	Mon Mar 27 17:03:54 2000
@@ -46,6 +46,7 @@
 extern int sysctl_overcommit_memory;
 extern int max_threads;
 extern int nr_queued_signals, max_queued_signals;
+extern int fairsched;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -222,6 +223,10 @@
 	{KERN_OVERFLOWGID, "overflowgid", &overflowgid, sizeof(int), 0644, NULL,
 	 &proc_dointvec_minmax, &sysctl_intvec, NULL,
 	 &minolduid, &maxolduid},
+#ifdef CONFIG_FAIRSCHED
+	{KERN_FAIRSCHED, "fairsched", &fairsched, sizeof(int),
+	 0644, NULL, &proc_dointvec},
+#endif
 	{0}
 };
 
--- linux-2.3.99-pre3/include/linux/sched.h.orig	Mon Mar 27 14:07:14 2000
+++ linux-2.3.99-pre3/include/linux/sched.h	Sun Apr  2 17:32:14 2000
@@ -255,7 +255,14 @@
  * Right now it is only used to track how many processes a
  * user has, but it has the potential to track memory usage etc.
  */
-struct user_struct;
+struct user_struct {
+	atomic_t count;
+	struct user_struct *next, **pprev;
+	struct user_struct *l_next, *l_prev;
+	struct task_struct *move_task;
+	unsigned int uid;
+	long cpu_ticks;
+};
 
 struct task_struct {
 /* these are hardcoded - don't touch */
@@ -448,6 +455,8 @@
 /* PID hashing. (shouldnt this be dynamic?) */
 #define PIDHASH_SZ (4096 >> 2)
 extern struct task_struct *pidhash[PIDHASH_SZ];
+extern struct user_struct uidhead;
+extern rwlock_t uidhash_lock;
 
 #define pid_hashfn(x)	((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
 
@@ -822,6 +831,26 @@
 #define for_each_task(p) \
 	for (p = &init_task ; (p = p->next_task) != &init_task ; )
 
+#define for_each_user_struct(up) \
+	for (up = &uidhead ; (up = up->l_next) != &uidhead ; )
+
+static inline void move_task_last(struct user_struct * up)
+{
+	struct task_struct *p = up->move_task;
+	struct task_struct *pn = p->next_task;
+	struct task_struct *pp = p->prev_task;
+	struct task_struct *ip = init_task.prev_task;
+
+	if (p != ip) {
+		/* otherwise we'd drop p from the task list */
+		pn->prev_task = pp;
+		pp->next_task = pn;
+		p->prev_task = ip;
+		p->next_task = &init_task;
+		init_task.prev_task = p;
+		ip->next_task = p;
+	}
+}
 
 static inline void del_from_runqueue(struct task_struct * p)
 {
--- linux-2.3.99-pre3/include/linux/sysctl.h.orig	Mon Mar 27 16:55:49 2000
+++ linux-2.3.99-pre3/include/linux/sysctl.h	Mon Mar 27 16:56:41 2000
@@ -112,6 +112,7 @@
 	KERN_OVERFLOWUID=46,	/* int: overflow UID */
 	KERN_OVERFLOWGID=47,	/* int: overflow GID */
 	KERN_SHMPATH=48,	/* string: path to shm fs */
+	KERN_FAIRSCHED=49,	/* turn fair scheduler on/off */
 };
 
 
--- linux-2.3.99-pre3/arch/i386/config.in.orig	Mon Mar 27 17:04:26 2000
+++ linux-2.3.99-pre3/arch/i386/config.in	Mon Mar 27 17:06:56 2000
@@ -138,6 +138,9 @@
    source drivers/pcmcia/Config.in
 fi
 
+if [ "$CONFIG_EXPERIMENTAL" = "y" ] ; then
+   bool 'Fair scheduler' CONFIG_FAIRSCHED
+fi
 bool 'System V IPC' CONFIG_SYSVIPC
 bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
 bool 'Sysctl support' CONFIG_SYSCTL

