From: Alexander Viro <viro@math.psu.edu>
To: Linus Torvalds <torvalds@transmeta.com>
Subject: [PATCH] alternative API for raw devices
Date: Wed, 1 May 2002 16:12:20 -0400 (EDT)
Cc: "Stephen C. Tweedie" <sct@redhat.com>, linux-kernel@vger.kernel.org

	New option: CONFIG_RAW (tristate)

With that animal enabled you can say
# mount -t raw /dev/sda1 /dev/<whatever>
and get a raw device bound to sda1 visible on /dev/<whatever>.

Old raw devices still work - the drivers do not conflict.  Actual IO code
is pretty much copied from the old driver.  The main differences:
	* device is originally created with ownership/permissions of the
	  block device we'd used; you can chmod/chown it at any time,
	  obviously.
	* it's _not_ a character device - stat() will give you S_IFREG.
	  To check that <foo> is a new-style raw device call statfs(2) and
	  compare .f_type with rawfs magic (0x726177).  It doesn't conflict
	  with the existing check for raw devices (stat(), check that it's a
	  character device and compare major with RAW_MAJOR), so existing
	  software can be taught to check for raw devices in a
	  backwards-compatible way.

umount will undo the binding, obviously.

The thing works and is very small (less than 3Kb text+data+bss).  BTW, it
can be built as a module.

I'm not sure if the name of the config option is right - maybe CONFIG_RAW_FS
would be better (with CONFIG_RAW_DEV added when and if we would want to make
the old one conditional).

If nothing else, it's interesting as an example of doing a driver-exported
mini-fs instead of messing with ioctl().  Enjoy:

diff -urN C12-0/fs/Config.help C12-current/fs/Config.help
--- C12-0/fs/Config.help	Wed May  1 15:34:58 2002
+++ C12-current/fs/Config.help	Wed May  1 15:52:41 2002
@@ -6,6 +6,24 @@
   <http://www.linuxdoc.org/docs.html#howto>. Probably the quota support is
   only useful for multi user systems. If unsure, say N.
 
+CONFIG_RAW
+  If you say Y here, you will be able to work with raw devices without
+  any special tools - mount -t raw <block device> <file> will bind
+  a raw device with the block one and put it on top of file and umount <file>
+  will undo that.  No magic control devices, no ioctls, just plain mount(2).
+  Old raw devices are still there - these drivers are completely independent.
+  If unsure, say Y.
+
+  Note: to check that <file> is a new-style raw device you need either
+  look for raw mounted on <file> (in /proc/mounts or /etc/mtab) or
+  call statfs(<file>, &stat_buf) and compare stat_buf.f_type with
+  rawfs magic (0x726177).
+
+  If you want to compile this as a module ( = code which can be
+  inserted in and removed from the running kernel whenever you want),
+  say M here and read <file:Documentation/modules.txt>.  The module
+  will be called raw.o.
+
 CONFIG_MINIX_FS
   Minix is a simple operating system used in many classes about OS's.
   The minix file system (method to organize files on a hard disk
diff -urN C12-0/fs/Config.in C12-current/fs/Config.in
--- C12-0/fs/Config.in	Wed May  1 15:34:58 2002
+++ C12-current/fs/Config.in	Wed May  1 15:45:22 2002
@@ -5,6 +5,7 @@
 comment 'File systems'
 
 bool 'Quota support' CONFIG_QUOTA
+tristate 'Filesystem interface to raw devices' CONFIG_RAW
 tristate 'Kernel automounter support' CONFIG_AUTOFS_FS
 tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS
 
diff -urN C12-0/fs/Makefile C12-current/fs/Makefile
--- C12-0/fs/Makefile	Wed May  1 15:34:58 2002
+++ C12-current/fs/Makefile	Wed May  1 15:45:22 2002
@@ -71,6 +71,7 @@
 subdir-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs
 subdir-$(CONFIG_JFS_FS)		+= jfs
 
+obj-$(CONFIG_RAW)		+= raw.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 
diff -urN C12-0/fs/raw.c C12-current/fs/raw.c
--- C12-0/fs/raw.c	Wed Dec 31 19:00:00 1969
+++ C12-current/fs/raw.c	Wed May  1 15:51:43 2002
@@ -0,0 +1,346 @@
+/*
+ * fs/raw.c
+ *
+ * raw devices without a barf-bag
+ *
+ * derived from drivers/char/raw.c - actual IO operations are almost exact
+ * copy, API for controlling that beast replaced with sane one.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/iobuf.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+
+/*
+ * Per-superblock state: the block device the raw file is bound to and
+ * the number of opens currently outstanding (see raw_open/raw_release).
+ */
+struct raw_dev {
+	struct block_device *bdev;
+	int count;
+};
+
+/* Private data hangs off the superblock. */
+static inline struct raw_dev *raw_dev(struct super_block *s)
+{
+	return s->u.generic_sbp;
+}
+
+/* Protects raw_dev->count and the first-open setup in raw_open(). */
+static spinlock_t count_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Open: make sure the file has a kiobuf, open the underlying block device
+ * and, if nobody else has it open, pick up its sector size and size.
+ */
+static int raw_open(struct inode *inode, struct file *filp)
+{
+	struct super_block *s = inode->i_sb;
+	struct block_device *bdev = raw_dev(s)->bdev;
+	int sector_size;
+	int err;
+
+	if (!filp->f_iobuf) {
+		err = alloc_kiovec(1, &filp->f_iobuf);
+		if (err)
+			return err;
+	}
+
+	atomic_inc(&bdev->bd_count);
+	err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW);
+	if (err)
+		return err;
+
+	sector_size = bdev_hardsect_size(bdev);
+
+	spin_lock(&count_lock);
+	if (!raw_dev(s)->count++) {
+		int bits;
+
+		s->s_blocksize = sector_size;
+		/* log2 of the sector size */
+		for (bits = 0; !(sector_size & 1); sector_size>>=1, bits++)
+			;
+		s->s_blocksize_bits = bits;
+		inode->i_size = bdev->bd_inode->i_size;
+	}
+	spin_unlock(&count_lock);
+	return 0;
+}
+
+/* Release: drop the open count and close the underlying block device. */
+static int raw_release(struct inode *inode, struct file *filp)
+{
+	struct raw_dev *p = raw_dev(inode->i_sb);
+	spin_lock(&count_lock);
+	p->count--;
+	spin_unlock(&count_lock);
+	blkdev_put(p->bdev, BDEV_RAW);
+	return 0;
+}
+
+/*
+ * Common read/write path.  Offset and size must be multiples of the sector
+ * size (-EINVAL otherwise).  If anything got transferred, *offp is advanced
+ * and the byte count is returned; otherwise the result is 0 or a negative
+ * error.
+ */
+static ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
+			  size_t size, loff_t *offp)
+{
+	struct super_block *s = filp->f_dentry->d_inode->i_sb;
+	struct kiobuf *iobuf = filp->f_iobuf;
+	struct block_device *bdev = raw_dev(s)->bdev;
+	int sector_size = s->s_blocksize;
+	int sector_bits = s->s_blocksize_bits;
+	int sector_mask = sector_size - 1;
+	unsigned long limit = bdev->bd_inode->i_size >> sector_bits;
+	sector_t blocknr = *offp >> sector_bits;
+	size_t transferred = 0;
+	int new_iobuf = 0;
+	int err;
+
+	/*
+	 * First, a few checks on device size limits
+	 */
+
+	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
+		/*
+		 * A parallel read/write is using the preallocated iobuf
+		 * so just run slow and allocate a new one.
+		 */
+		err = alloc_kiovec(1, &iobuf);
+		if (err)
+			goto out;
+		new_iobuf = 1;
+	}
+
+	err = -EINVAL;
+	if ((*offp & sector_mask) || (size & sector_mask))
+		goto out_free;
+	/* at or past the end: 0 for an empty request, -ENXIO otherwise */
+	err = 0;
+	if (size)
+		err = -ENXIO;
+	if ((*offp >> sector_bits) >= limit)
+		goto out_free;
+
+	while (size > 0) {
+		unsigned long blocks = size >> sector_bits;
+		int iosize;
+
+		/* clamp the request to the end of the device */
+		if (blocks > limit - blocknr)
+			blocks = limit - blocknr;
+		if (!blocks)
+			break;
+
+		iosize = blocks << sector_bits;
+
+		err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
+		if (err)
+			break;
+
+		err = brw_kiovec(rw, 1, &iobuf, bdev, &blocknr, sector_size);
+
+		if (rw == READ && err > 0)
+			mark_dirty_kiobuf(iobuf, err);
+
+		if (err >= 0) {
+			transferred += err;
+			size -= err;
+			buf += err;
+		}
+
+		blocknr += blocks;
+
+		unmap_kiobuf(iobuf);
+
+		/* short transfer or error - stop here */
+		if (err != iosize)
+			break;
+	}
+
+	if (transferred) {
+		*offp += transferred;
+		err = transferred;
+	}
+
+out_free:
+	if (!new_iobuf)
+		clear_bit(0, &filp->f_iobuf_lock);
+	else
+		free_kiovec(1, &iobuf);
+out:
+	return err;
+}
+
+static ssize_t raw_read(struct file *filp, char * buf,
+			size_t size, loff_t *offp)
+{
+	return rw_raw_dev(READ, filp, buf, size, offp);
+}
+
+static ssize_t raw_write(struct file *filp, const char *buf,
+			 size_t size, loff_t *offp)
+{
+	return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
+}
+
+static struct file_operations raw_fops = {
+	open:		raw_open,
+	release:	raw_release,
+	read:		raw_read,
+	write:		raw_write,
+};
+
+/* Show the bound device as " dev=major:minor" in the mount options. */
+static int raw_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	dev_t dev = raw_dev(mnt->mnt_sb)->bdev->bd_dev;
+	seq_printf(m, " dev=%d:%d", MAJOR(dev), MINOR(dev));
+	return 0;
+}
+
+static struct super_operations s_ops = {
+	statfs:		simple_statfs,
+	show_options:	raw_show_options,
+};
+
+/*
+ * Mount: look up dev_name, check that it is a block device living on a
+ * filesystem not mounted nodev, and build a superblock whose root is a
+ * single regular file with owner and permissions of the device node.
+ */
+static struct super_block *raw_get_sb(struct file_system_type *fs_type,
+	int flags, char *dev_name, void *data)
+{
+	struct inode *inode, *root;
+	struct super_block *s;
+	struct nameidata nd;
+	struct raw_dev *p = kmalloc(sizeof(struct raw_dev), GFP_KERNEL);
+	int error = -ENOMEM;
+
+	if (!p)
+		goto out;
+
+	/* sanity check for device name */
+	error = -EINVAL;
+	if (!dev_name || !*dev_name)
+		goto out1;
+
+	/* find it */
+	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	if (error)
+		goto out1;
+
+	/* is it a block device? */
+	inode = nd.dentry->d_inode;
+	error = -ENOTBLK;
+	if (!S_ISBLK(inode->i_mode))
+		goto out2;
+
+	/* do we have it on nodev filesystem? */
+	error = -EACCES;
+	if (nd.mnt->mnt_flags & MNT_NODEV)
+		goto out2;
+
+	/* get struct block_device */
+	error = bd_acquire(inode);
+	if (error)
+		goto out2;
+
+	/* allocate superblock */
+	s = sget(fs_type, NULL, set_anon_super, NULL);
+	if (IS_ERR(s)) {
+		bdput(inode->i_bdev);
+		path_release(&nd);
+		kfree(p);
+		return s;
+	}
+
+	/* set it up */
+	s->s_blocksize = PAGE_CACHE_SIZE;
+	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_magic = 0x726177;
+	s->s_op = &s_ops;
+	s->u.generic_sbp = p;
+	p->bdev = inode->i_bdev;
+	p->count = 0;
+
+	/* allocate (the only) inode */
+	error = -ENOMEM;
+	root = new_inode(s);
+	if (!root)
+		goto out3;
+
+	/* set it up */
+	root->i_mode = S_IFREG | (inode->i_mode & S_IRWXUGO);
+	root->i_uid = inode->i_uid;
+	root->i_gid = inode->i_gid;
+	root->i_blksize = PAGE_CACHE_SIZE;
+	root->i_blocks = 0;
+	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
+	root->i_fop = &raw_fops;
+
+	/* make it root */
+	s->s_root = d_alloc_root(root);
+	if (!s->s_root)
+		goto out4;
+	s->s_flags |= MS_ACTIVE;
+	path_release(&nd);
+	return s;
+
+out4:
+	iput(root);
+out3:
+	/*
+	 * p is already attached to the superblock, so it (and the bdev
+	 * reference) belongs to raw_kill_sb() now, which deactivate_super()
+	 * is expected to call via ->kill_sb().  Do not fall through to
+	 * out1 - that would kfree(p) a second time.
+	 */
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	path_release(&nd);
+	return ERR_PTR(error);
+out2:
+	path_release(&nd);
+out1:
+	kfree(p);
+out:
+	return ERR_PTR(error);
+}
+
+/* ->kill_sb: shut down the superblock, drop the bdev, free our data. */
+static void raw_kill_sb(struct super_block *s)
+{
+	struct raw_dev *p = raw_dev(s);
+	kill_anon_super(s);
+	bdput(p->bdev);
+	kfree(p);
+}
+
+static struct file_system_type raw_fs_type = {
+	owner:		THIS_MODULE,
+	name:		"raw",
+	get_sb:		raw_get_sb,
+	kill_sb:	raw_kill_sb,
+};
+
+static int __init init_rawfs(void)
+{
+	return register_filesystem(&raw_fs_type);
+}
+
+static void __exit exit_rawfs(void)
+{
+	unregister_filesystem(&raw_fs_type);
+}
+
+EXPORT_NO_SYMBOLS;
+MODULE_LICENSE("GPL"); /* fair is fair - derived from GPLed code */
+module_init(init_rawfs)
+module_exit(exit_rawfs)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/