[LWN Logo]

From:	Borislav Deianov <borislav@lix.polytechnique.fr>
Date:	Thu, 6 Apr 2000 00:42:27 +0200
To:	linux-kernel@vger.rutgers.edu
Subject: [RFC] fast atomic ps


--wq9mPyueHGvFACwf
Content-Type: text/plain; charset=us-ascii

Hi,

A while ago there was a patch by Erik Andersen that implemented a
/dev/ps device as a small functional replacement of /proc for embedded
systems. I'd like to propose another way of implementing such a device
that is also small and solves a couple of other problems as well:

- Reading and parsing the entire /proc tree is quite slow; as a result,
top uses way too much CPU time on busy systems;

- If there is a lot of process activity (creation and death) while the
/proc tree is being read, the resulting view of the system is often
inconsistent. Imagine, for example, a program which does this:

	while(1) {
		for (i=0; i<1000; i++);
		if (fork())
			exit(0);
	}

It's possible that ps fails to show _any_ copies of this nasty program
if the pid wraps around from high to low at exactly the right time.

In summary, I'd like something that is 1) small, 2) fast, 3) provides
an atomic snapshot of all the information in /proc/<pid>/*. I propose
a device /dev/procsnap that behaves as follows:

- when you open the device, the driver allocates some memory and
	copies all the data from /proc atomically;
- you can read the data at your leisure, it doesn't change 
	under you (in most cases a single read call is enough);
- the memory for the snapshot is released when the device is closed;
- many processes can open the device at the same time, they get
	separate snapshots.

I've attached a very preliminary implementation of the kernel driver
and a proof of concept ps that uses it. If there's interest I'll code
a more complete patch.

Best wishes,
Borislav

--wq9mPyueHGvFACwf
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="procsnap.c"

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/uaccess.h>

/* Version string printed by procsnap_init() when the module is loaded. */
#define PROCSNAP_VERSION "0.1"

/* Descriptive metadata attached to the module. */
MODULE_DESCRIPTION("Provides snapshots of process data");
MODULE_AUTHOR("Borislav Deianov <borislav@cs.cornell.edu>");

/*
 * One snapshot record describing a single task.  take_snapshot() fills an
 * array of these, and procsnap_read() copies that array to user space as
 * raw bytes, so the layout is visible to readers of the device.
 */
struct proc_struct {
	/* copied from the task's pid field */
	pid_t pid;
	/* copied from the task's state field */
	unsigned long state;
	/* etc */
};

/*
 * A snapshot as stored in file->private_data for each open of the device.
 */
struct snap_struct {
	/* vmalloc'ed array of records, freed with vfree() on release */
	struct proc_struct *snapshot;
	/* size of the valid data in snapshot, in bytes (not in records) */
	unsigned int len;
};

/*
 * take_snapshot - allocate and fill a snapshot of the current task list.
 *
 * The record buffer is sized from nr_threads before tasklist_lock is taken
 * (vmalloc may sleep), and re-checked under the lock; if the task count has
 * outgrown the buffer in the meantime, the buffer is freed and the
 * allocation is retried.
 *
 * Returns a newly allocated snapshot that the caller must free with
 * release_snapshot(), or NULL if memory could not be allocated.
 */
static struct snap_struct *take_snapshot(void)
{
	struct snap_struct *snap;
	struct proc_struct *ptr;
	struct task_struct *p;
	unsigned int capacity;
	unsigned int count;

	snap = kmalloc(sizeof(*snap), GFP_KERNEL);
	if (snap == NULL)
		return NULL;

 repeat:
	snap->len = nr_threads * sizeof(struct proc_struct);
	/* some slack in case nr_threads increases (we get it for free) */
	snap->len = PAGE_ALIGN(snap->len);
	snap->snapshot = vmalloc(snap->len);
	if (snap->snapshot == NULL) {
		/* don't leak the descriptor when the buffer can't be had */
		kfree(snap);
		return NULL;
	}

	read_lock(&tasklist_lock);

	if (snap->len < nr_threads * sizeof(struct proc_struct)) {
		read_unlock(&tasklist_lock);
		vfree(snap->snapshot);
		goto repeat;
	}

	/*
	 * Never write past the buffer, and report only the records that
	 * were actually filled in, so that no uninitialised record can
	 * reach user space if the list and nr_threads ever disagree.
	 */
	capacity = snap->len / sizeof(struct proc_struct);
	count = 0;
	ptr = snap->snapshot;
	for_each_task(p) {
		if (count == capacity)
			break;

		ptr->pid = p->pid;
		ptr->state = p->state;

		ptr++;
		count++;
	}
	snap->len = count * sizeof(struct proc_struct);

	read_unlock(&tasklist_lock);

	printk("took snapshot of %u processes\n", count);

	return snap;
}

/*
 * release_snapshot - free a snapshot produced by take_snapshot().
 * @data: the struct snap_struct to free, passed as an untyped pointer.
 *
 * Frees the record buffer first, then the descriptor that points to it.
 */
static void release_snapshot(void *data)
{
	struct snap_struct *s = data;

	vfree(s->snapshot);
	kfree(s);
}


/*
 * procsnap_open - open handler: take a private snapshot for this file.
 *
 * The snapshot pointer is stored in file->private_data.  Returns 0 on
 * success, or -ENOMEM (with the module use count restored) if no snapshot
 * could be taken.
 */
static int procsnap_open (struct inode *inode, struct file *file)
{
	struct snap_struct *snap;

	MOD_INC_USE_COUNT;

	snap = take_snapshot();
	file->private_data = snap;
	if (snap != NULL)
		return 0;

	MOD_DEC_USE_COUNT;
	return -ENOMEM;
}

/*
 * procsnap_release - release handler: free this file's snapshot and drop
 * the module use count taken in procsnap_open().  Always returns 0.
 */
static int procsnap_release (struct inode *inode, struct file *file)
{
	void *snap = file->private_data;

	release_snapshot(snap);
	MOD_DEC_USE_COUNT;

	return 0;
}

/*
 * procsnap_read - read handler: copy part of the snapshot to user space.
 * @file:  open file whose private_data holds the snapshot
 * @buf:   user-space destination buffer
 * @count: maximum number of bytes to copy
 * @off:   in/out byte offset into the snapshot
 *
 * Returns the number of bytes copied (advancing *off by that amount),
 * 0 once *off is at or past the end of the snapshot, -EINVAL for a
 * negative offset, or -EFAULT if the user buffer could not be written.
 */
static ssize_t procsnap_read (struct file *file, char *buf, size_t count, 
			      loff_t *off)
{
	struct snap_struct *snap = file->private_data;

	/* a negative offset would index before the start of the buffer */
	if (*off < 0)
		return -EINVAL;
	if (*off >= snap->len)
		return 0;
	/* compare against the remainder so a huge count cannot wrap the sum */
	if (count > snap->len - *off)
		count = snap->len - *off;
	/*
	 * copy_to_user() returns the number of bytes it could NOT copy,
	 * never a negative value, so any non-zero result is a fault.
	 */
	if (copy_to_user(buf, (char *)snap->snapshot + *off, count))
		return -EFAULT;
	*off += count;
	return count;
}

/* File operations of the procsnap device: open, release and read only. */
static struct file_operations procsnap_fops = {
	.open		= procsnap_open,
	.release	= procsnap_release,
	.read		= procsnap_read,
};

/* Misc-device descriptor registered by procsnap_init(). */
static struct miscdevice procsnap_miscdev = {
	.minor	= 248,		/* XXX get a real minor from HPA */
	.name	= "procsnap",
	.fops	= &procsnap_fops,
};

/*
 * procsnap_init - module init: register the misc device.
 *
 * Returns 0 on success, or the negative error from misc_register().
 */
static int __init procsnap_init(void)
{
	int error = misc_register(&procsnap_miscdev);

	if (error >= 0) {
		printk("Procsnap v%s.\n", PROCSNAP_VERSION);
		return 0;
	}

	printk(KERN_ERR "Unable to register procsnap device: %d\n", error);
	return error;
}

/*
 * procsnap_cleanup - module exit: deregister the misc device, logging an
 * error if deregistration fails.
 */
static void __exit procsnap_cleanup(void)
{
	int error = misc_deregister(&procsnap_miscdev);

	if (error < 0)
		printk(KERN_ERR "Unable to deregister procsnap device: %d\n",
		       error);
}

/* Register procsnap_init/procsnap_cleanup as the module's entry and exit hooks. */
module_init(procsnap_init);
module_exit(procsnap_cleanup);

--wq9mPyueHGvFACwf
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="mini-ps.c"

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * One record as read from /dev/procsnap; main() interprets the bytes
 * returned by read() as an array of these.
 * NOTE(review): must stay in sync with the struct of the same name that
 * the driver copies out -- confirm whenever fields are added.
 */
struct proc_struct {
	/* process id, printed with %d */
	pid_t pid;
	/* task state value, printed with %lu */
	unsigned long state;
	/* etc */
};

/* Size in bytes of main()'s read buffer: room for 16 records. */
#define BUF (16*sizeof(struct proc_struct))

/*
 * Proof-of-concept ps: open /dev/procsnap, read the snapshot in chunks of
 * up to BUF bytes and print the pid and state of every record.
 *
 * Exits with EXIT_FAILURE (after printing the reason with perror) if the
 * device cannot be opened, read or closed; EXIT_SUCCESS otherwise.
 */
int main(void)
{
	/* typed buffer: correctly aligned for the records, no pointer casts */
	struct proc_struct buf[BUF / sizeof(struct proc_struct)];
	ssize_t len;
	size_t i, nrec;
	int fd;

	if ((fd = open("/dev/procsnap", O_RDONLY)) == -1) {
		perror("open");
		exit(EXIT_FAILURE);
	}

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		nrec = (size_t)len / sizeof(struct proc_struct);
		for (i = 0; i < nrec; i++)
			printf("pid = %d, state = %lu\n",
			       (int)buf[i].pid, buf[i].state);
	}

	/* read() returns -1 on error; don't mistake that for end of data */
	if (len == -1) {
		perror("read");
		close(fd);
		exit(EXIT_FAILURE);
	}

	if (close(fd) == -1) {
		perror("close");
		exit(EXIT_FAILURE);
	}

	exit(EXIT_SUCCESS);
}

--wq9mPyueHGvFACwf--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/