[LWN Logo]

Date:	Wed, 24 May 2000 17:29:56 -0400
From:	John Baboval <baboval@mclinux.com>
To:	linux-kernel@vger.rutgers.edu, torvalds@transmeta.com
Subject: [patch] profile enhancements, please read!


--dDRMvlgZJXvWKvBx
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

The attached patch makes the following enhancements to the kernel profiler:

- Dynamic allocation of the profile buffer.

     This allows for profiling on a production system without the continuous
     performance hit that the profiler causes. You can turn it on, get your
     data, and then turn it off again....
     I submitted this as a patch to 2.3.99pre8, but nobody commented, and it
     didn't get applied. I'm resubmitting because the further enhancements
     are impractical without this. Control of the buffer is via a
     /proc/sys/kernel entry 'prof_shift'. You can write the same value to
     this file as you would pass to the profile= parameter at boot time. To
     turn off profiling, write a zero to the file. I have a second
     implementation of this feature as a system call which I feel is
     cleaner, though less practical. Any comments on this matter would be
     appreciated.
     
- Selective profiling by context
    
     This allows the kernel profiler to only be activated when the context
     matches the constraints specified in the new files
     /proc/sys/kernel/prof_pid and prof_name. The profiler is triggered when
     the value in prof_pid equals the pid, or when the value in prof_name
     matches the comm entry in the task_struct. If prof_name is empty
     (default) and prof_pid is -1 (default) the profiler acts as usual.
     The benefit of this is being able to see what section of the kernel is
     affecting the particular task you are trying to optimize.
     I also am interested in hearing whether people would like the opposite of
     this, i.e. all processes except one specified.... This would be a
     simple addition. This would be useful for testing with a client and
     server running on the same machine (if this is all you have to work
     with).
     
- Percentage readouts for readprofile

     These aren't included in this patch, as it's not directly kernel
     related. If anyone wants these they can e-mail me. Basically this adds
     a column to the readprofile output that indicates the percentage of
     time spent in each area. 

Please review and test/apply this patch.

-- 

-John 
<baboval@missioncriticallinux.com>

--dDRMvlgZJXvWKvBx
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="patch-2.3.99pre9-profile"

diff -u -b --recursive --new-file 2.3.99pre9_clean/arch/alpha/kernel/irq_impl.h linux/arch/alpha/kernel/irq_impl.h
--- 2.3.99pre9_clean/arch/alpha/kernel/irq_impl.h	Sun Feb 27 16:28:01 2000
+++ linux/arch/alpha/kernel/irq_impl.h	Wed May 24 16:35:56 2000
@@ -41,6 +41,8 @@
 extern void handle_irq(int irq, struct pt_regs * regs);
 
 extern unsigned long prof_cpu_mask;
+extern char prof_name[16];
+extern int prof_pid;
 
 static inline void
 alpha_do_profile(unsigned long pc)
@@ -49,6 +51,11 @@
 
 	if (!prof_buffer)
 		return;
+
+        if(prof_pid != -1 || prof_name[0] != 0)
+                if((current->pid != prof_pid) && 
+	           (strcmp(current->comm, prof_name) != 0))
+                        return;
 
 	/*
 	 * Only measure the CPUs specified by /proc/irq/prof_cpu_mask.
diff -u -b --recursive --new-file 2.3.99pre9_clean/fs/proc/proc_misc.c linux/fs/proc/proc_misc.c
--- 2.3.99pre9_clean/fs/proc/proc_misc.c	Fri May 12 14:36:27 2000
+++ linux/fs/proc/proc_misc.c	Wed May 24 16:40:59 2000
@@ -590,7 +590,7 @@
 	return count;
 }
 
-static struct file_operations proc_profile_operations = {
+struct file_operations proc_profile_operations = {
 	read:		read_profile,
 	write:		write_profile,
 };
diff -u -b --recursive --new-file 2.3.99pre9_clean/include/asm-i386/hw_irq.h linux/include/asm-i386/hw_irq.h
--- 2.3.99pre9_clean/include/asm-i386/hw_irq.h	Tue May 23 17:18:47 2000
+++ linux/include/asm-i386/hw_irq.h	Wed May 24 16:33:43 2000
@@ -181,6 +181,9 @@
 extern unsigned int * prof_buffer;
 extern unsigned long prof_len;
 extern unsigned long prof_shift;
+extern char prof_name[16];
+extern int prof_pid;
+
 
 /*
  * x86 profiling function, SMP safe. We might want to do this in
@@ -190,6 +193,11 @@
 {
 	if (!prof_buffer)
 		return;
+
+        if(prof_pid != -1 || prof_name[0] != 0)
+                if((current->pid != prof_pid) && 
+	           (strcmp(current->comm, prof_name) != 0))
+                        return;
 
 	/*
 	 * Only measure the CPUs specified by /proc/irq/prof_cpu_mask.
diff -u -b --recursive --new-file 2.3.99pre9_clean/include/linux/sysctl.h linux/include/linux/sysctl.h
--- 2.3.99pre9_clean/include/linux/sysctl.h	Fri May 12 14:36:27 2000
+++ linux/include/linux/sysctl.h	Wed May 24 16:42:39 2000
@@ -112,6 +112,9 @@
 	KERN_OVERFLOWUID=46,	/* int: overflow UID */
 	KERN_OVERFLOWGID=47,	/* int: overflow GID */
 	KERN_SHMPATH=48,	/* string: path to shm fs */
+	KERN_PROFSWITCH=49,     /* int: sets the profile shift value */
+        KERN_PROF_PID=50,       /* int: sets the pid to profile */
+        KERN_PROF_NAME=51,      /* string: sets the process name to profile */
 };
 
 
diff -u -b --recursive --new-file 2.3.99pre9_clean/kernel/Makefile linux/kernel/Makefile
--- 2.3.99pre9_clean/kernel/Makefile	Wed Mar 22 12:39:11 2000
+++ linux/kernel/Makefile	Wed May 24 16:43:09 2000
@@ -10,7 +10,7 @@
 O_TARGET := kernel.o
 O_OBJS    = sched.o dma.o fork.o exec_domain.o panic.o printk.o sys.o \
 	    module.o exit.o itimer.o info.o time.o softirq.o resource.o \
-	    sysctl.o acct.o capability.o ptrace.o timer.o
+	    sysctl.o acct.o capability.o ptrace.o timer.o profile.o
 
 OX_OBJS  += signal.o
 
diff -u -b --recursive --new-file 2.3.99pre9_clean/kernel/profile.c linux/kernel/profile.c
--- 2.3.99pre9_clean/kernel/profile.c	Wed Dec 31 19:00:00 1969
+++ linux/kernel/profile.c	Wed May 24 16:46:25 2000
@@ -0,0 +1,125 @@
+/* This is the runtime enabler/disabler function for the kernel profiler.
+ * The profile shift value is passed as a parameter; if it is 0 then profiling
+ * is disabled.
+ * 
+ * return values:
+ * 0 - Profiling is disabled
+ * 1 - Profiling successfully enabled; buffer was allocated with kmalloc
+ * 2 - Profiling successfully enabled; buffer was allocated with vmalloc
+ * 3 - bootmem profile buffer was left untouched
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/malloc.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <asm/page.h>
+
+#if defined(CONFIG_PROC_FS)
+
+extern unsigned int * prof_buffer;
+extern unsigned long prof_len;
+extern unsigned long prof_shift;
+unsigned int  prof_type = 0;
+extern struct proc_dir_entry proc_root;
+
+extern char _stext, _etext;
+extern struct file_operations proc_profile_operations;
+
+long prof_switch(long shift)
+{
+	/* Enable (shift > 0) or disable (shift <= 0) the profiler at run time.
+	 * Returns 0 if profiling ends up off (including a failed allocation),
+	 * 1/2 if a kmalloc'ed/vmalloc'ed buffer is active, and 3 if a
+	 * boot-time buffer exists and was left untouched. */
+	struct proc_dir_entry *entry;
+	unsigned long bytes;
+	unsigned int size, type;
+	void *buffer;
+
+	if (shift <= 0) {
+		/* Turn off profiling */
+		if (!prof_buffer)
+			return 0;
+		if (prof_type != 1 && prof_type != 2)
+			return 3;	/* This is a boottime profile buffer... */
+		remove_proc_entry("profile", &proc_root);
+		/* Clear prof_buffer before freeing it: the arch profiling
+		 * functions return early once it is NULL. */
+		buffer = prof_buffer;
+		prof_buffer = NULL;
+		if (prof_type == 1)	/* Buffer was allocated with kmalloc */
+			kfree(buffer);
+		else			/* buffer was allocated with vmalloc */
+			vfree(buffer);
+		prof_type = 0;
+		/* Report "off" from now on: the prof_shift sysctl handler copies
+		 * prof_shift back and passes it to prof_switch() again, so a
+		 * stale value here would re-enable profiling. */
+		prof_shift = 0;
+		return 0;
+	}
+
+	/* Don't create a new profile buffer/entry if one already exists,
+	 * But still honor the request in case the resolution has changed */
+	if (prof_buffer != NULL) {
+		if (shift == prof_shift)
+			return prof_type;
+
+		/*
+		 * A boot-time buffer cannot be released; stop here rather than
+		 * overwrite prof_buffer and contradict return value 3.
+		 */
+		if (prof_switch(0) == 3)
+			return 3;
+	}
+
+	prof_shift = shift;
+
+	/* only text is profiled */
+	prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
+	bytes = prof_len * sizeof(unsigned int);
+
+	/* If prof_shift is too big then this is pointless. prof_shift should never be
+	 * bigger than log base 2 of the buffer size.  The width test keeps the
+	 * shift expression defined for oversized requests.
+	 * NOTE(review): ffz(~x) looks like the lowest set bit of x, not
+	 * floor(log2(x)) - confirm this clamp is what was intended. */
+#define log2(x) ffz(~(x))
+	if (prof_shift >= 8 * sizeof(prof_shift) || (1UL << prof_shift) >= bytes)
+		prof_shift = log2(bytes) - 1;
+	prof_len >>= prof_shift;
+
+	size = prof_len * sizeof(unsigned int) + PAGE_SIZE-1;
+	/* Use kmalloc if size < 512 * PAGE_SIZE; if it's bigger, or kmalloc
+	 * fails, use vmalloc */
+	buffer = NULL;
+	type = 1;
+	if (size < 512 * PAGE_SIZE)
+		buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer) {
+		buffer = vmalloc(size);
+		type = 2;
+	}
+	if (!buffer) {
+		/* Bail... Can't get enough memory...*/
+		prof_shift = 0;	/* nothing is active, so report "off" */
+		return 0;
+	}
+
+	/* Zero out the new profile buffer, then make it visible */
+	memset(buffer, 0, size);
+	prof_type = type;
+	prof_buffer = buffer;
+
+	/* Register the /proc/profile entry */
+	entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+	if (entry) {
+		entry->proc_fops = &proc_profile_operations;
+		entry->size = (1+prof_len) * sizeof(unsigned int);
+	}
+	return prof_type;
+}
+#endif /* CONFIG_PROC_FS */
diff -u -b --recursive --new-file 2.3.99pre9_clean/kernel/sysctl.c linux/kernel/sysctl.c
--- 2.3.99pre9_clean/kernel/sysctl.c	Fri May 12 14:21:20 2000
+++ linux/kernel/sysctl.c	Wed May 24 16:53:18 2000
@@ -127,6 +127,12 @@
 extern int inodes_stat[];
 extern int dentry_stat[];
 
+static int profile;
+extern unsigned long prof_shift;
+extern long prof_switch(long prof_shift);
+extern int prof_pid;
+extern char prof_name[16];
+
 /* The default sysctl tables: */
 
 static ctl_table root_table[] = {
@@ -222,6 +228,14 @@
 	{KERN_OVERFLOWGID, "overflowgid", &overflowgid, sizeof(int), 0644, NULL,
 	 &proc_dointvec_minmax, &sysctl_intvec, NULL,
 	 &minolduid, &maxolduid},
+        {KERN_PROFSWITCH, "prof_shift", &profile, sizeof(int), 0644, NULL, 
+	 &proc_prof_switch},
+        {KERN_PROF_PID, "prof_pid", &prof_pid, sizeof(int), 0644, NULL, 
+	 &proc_dointvec},
+        {KERN_PROF_NAME, "prof_name", 
+	     prof_name, 
+	     16, 0644, NULL, &proc_doutsstring, 
+         &sysctl_string},
 	{0}
 };
 
@@ -796,6 +810,15 @@
     return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
 }
 
+int proc_prof_switch(ctl_table *table, int write, struct file *filp,
+		     void *buffer, size_t *lenp)
+{   /* prof_shift sysctl handler: returns 0 or a negative errno, never prof_switch()'s 1/2/3. */
+    int err = do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
+    /* Only a successfully parsed write reconfigures the profiler; reads just report. */
+    if (write && err >= 0 && !prof_switch((long)profile) && profile > 0)
+        err = -ENOMEM;	/* shift > 0 yet prof_switch() returned 0: no buffer could be allocated */
+    profile = prof_shift;	/* expose the shift value now held in prof_shift */
+    return err;
+}
 /*
  *	init may raise the set.
  */
diff -u -b --recursive --new-file 2.3.99pre9_clean/kernel/timer.c linux/kernel/timer.c
--- 2.3.99pre9_clean/kernel/timer.c	Fri May 12 14:21:20 2000
+++ linux/kernel/timer.c	Wed May 24 16:40:01 2000
@@ -71,6 +71,8 @@
 unsigned int * prof_buffer;
 unsigned long prof_len;
 unsigned long prof_shift;
+int prof_pid = -1;
+char prof_name[16];
 
 /*
  * Event timer code

--dDRMvlgZJXvWKvBx--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/