diff options
Diffstat (limited to 'fs')
141 files changed, 7771 insertions, 5516 deletions
diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f4407eb276c7..12e1baa4508d 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -712,7 +712,7 @@ static void v9fs_read_work(void *a) * v9fs_send_request - send 9P request * The function can sleep until the request is scheduled for sending. * The function can be interrupted. Return from the function is not - * a guarantee that the request is sent succesfully. Can return errors + * a guarantee that the request is sent successfully. Can return errors * that can be retrieved by PTR_ERR macros. * * @m: mux data diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2cb87ba4b1c1..5c6bdf82146c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -530,9 +530,6 @@ error: if (vfid) v9fs_fid_destroy(vfid); - if (inode) - iput(inode); - return err; } @@ -1054,6 +1051,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer, int ret; char *link = __getname(); + if (unlikely(!link)) + return -ENOMEM; + if (buflen > PATH_MAX) buflen = PATH_MAX; @@ -1171,9 +1171,6 @@ error: if (vfid) v9fs_fid_destroy(vfid); - if (inode) - iput(inode); - return err; } @@ -1227,6 +1224,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, } name = __getname(); + if (unlikely(!name)) + return -ENOMEM; + sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); __putname(name); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 872943004e59..8b15bb22caca 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -256,11 +256,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) } static void -v9fs_umount_begin(struct super_block *sb) +v9fs_umount_begin(struct vfsmount *vfsmnt, int flags) { - struct v9fs_session_info *v9ses = sb->s_fs_info; + struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info; - v9fs_session_cancel(v9ses); + if (flags & MNT_FORCE) + v9fs_session_cancel(v9ses); } static struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index 467f7ae5f092..00aa3d5c5a83 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -776,7 +776,8 @@ endmenu menu "Pseudo filesystems" config PROC_FS - bool "/proc file system support" + bool "/proc file system support" if EMBEDDED + default y help This is a virtual file system providing information about the status of the system. "Virtual" means that it doesn't take up any space on @@ -1370,11 +1371,19 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL && BROKEN + depends on UFS_FS && EXPERIMENTAL help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. +config UFS_DEBUG + bool "UFS debugging" + depends on UFS_FS + help + If you are experiencing any problems with the UFS filesystem, say + Y here. This will result in _many_ additional debugging messages to be + written to the system log. + endmenu menu "Network File Systems" @@ -1481,7 +1490,12 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL + select NFSD_V2_ACL if NFSD_V3_ACL + select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFSD_TCP if NFSD_V4 + select CRYPTO_MD5 if NFSD_V4 + select CRYPTO if NFSD_V4 + select FS_POSIX_ACL if NFSD_V4 help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1519,7 +1533,6 @@ config NFSD_V3 config NFSD_V3_ACL bool "Provide server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 - select NFSD_V2_ACL help Implement the NFSv3 ACL protocol extension for manipulating POSIX Access Control Lists on exported file systems. NFS clients should @@ -1529,10 +1542,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL - select NFSD_TCP - select CRYPTO_MD5 - select CRYPTO - select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and diff --git a/fs/affs/super.c b/fs/affs/super.c index 8765cba35bb9..5200f4938df0 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) int reserved; unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ + u8 sig[4]; pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options"); @@ -370,8 +371,9 @@ got_root: printk(KERN_ERR "AFFS: Cannot read boot block\n"); goto out_error; } - chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data); + memcpy(sig, boot_bh->b_data, 4); brelse(boot_bh); + chksum = be32_to_cpu(*(__be32 *)sig); /* Dircache filesystems are compatible with non-dircache ones * when reading. As long as they aren't supported, writing is @@ -420,11 +422,11 @@ got_root: } if (mount_flags & SF_VERBOSE) { - chksum = cpu_to_be32(chksum); - printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n", - AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0], + u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; + printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + len > 31 ? 31 : len, AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, - (char *)&chksum,((char *)&chksum)[3] + '0',blocksize); + sig, sig[3] + '0', blocksize); } sb->s_flags |= MS_NODEV | MS_NOSUID; diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 009a9ae88d61..bfc1fd22d5b1 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer, /* we found it in the graveyard - resurrect it */ found_dead_server: - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_list); + list_move_tail(&server->link, &cell->sv_list); afs_get_server(server); afs_kafstimod_del_timer(&server->timeout); spin_unlock(&cell->sv_gylock); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 7ac07d0d47b9..f09a794f248e 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -136,8 +136,7 @@ static int kafsasyncd(void *arg) if (!list_empty(&kafsasyncd_async_attnq)) { op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link); - list_del(&op->link); - list_add_tail(&op->link, + list_move_tail(&op->link, &kafsasyncd_async_busyq); } @@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op) init_waitqueue_entry(&op->waiter, kafsasyncd_task); add_wait_queue(&op->call->waitq, &op->waiter); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_busyq); + list_move_tail(&op->link, &kafsasyncd_async_busyq); spin_unlock(&kafsasyncd_async_lock); @@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op) spin_lock(&kafsasyncd_async_lock); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_attnq); + list_move_tail(&op->link, &kafsasyncd_async_attnq); spin_unlock(&kafsasyncd_async_lock); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index b5cf9e1205ad..99785a79d043 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -203,7 +203,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* try and do the mount */ kdebug("--- attempting mount %s -o %s ---", devname, options); - mnt = do_kern_mount("afs", 0, devname, options); + mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options); kdebug("--- mount result %p ---", mnt); free_page((unsigned long) devname); diff --git a/fs/afs/server.c b/fs/afs/server.c index 62b093aa41c6..22afaae1a4ce 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, resurrect_server: _debug("resurrecting server"); - list_del(&zombie->link); - list_add_tail(&zombie->link, &cell->sv_list); + list_move_tail(&zombie->link, &cell->sv_list); afs_get_server(zombie); afs_kafstimod_del_timer(&zombie->timeout); spin_unlock(&cell->sv_gylock); @@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server) } spin_lock(&cell->sv_gylock); - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_graveyard); + list_move_tail(&server->link, &cell->sv_graveyard); /* time out in 10 secs */ afs_kafstimod_add_timer(&server->timeout, 10 * HZ); diff --git a/fs/afs/super.c b/fs/afs/super.c index 82468df0ba54..67d1f5c819ec 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); -static struct file_system_type afs_fs_type = { +struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", .get_sb = afs_get_sb, diff --git a/fs/afs/super.h b/fs/afs/super.h index ac11362f4e95..32de8cc6fae8 100644 --- a/fs/afs/super.h +++ b/fs/afs/super.h @@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) return sb->s_fs_info; } +extern struct file_system_type afs_fs_type; + #endif /* __KERNEL__ */ #endif /* _LINUX_AFS_SUPER_H */ diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index eced20618ecc..331f730a1fb3 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell, /* found in the graveyard - resurrect */ _debug("found in graveyard"); atomic_inc(&vlocation->usage); - list_del(&vlocation->link); - list_add_tail(&vlocation->link, &cell->vl_list); + list_move_tail(&vlocation->link, &cell->vl_list); spin_unlock(&cell->vl_gylock); afs_kafstimod_del_timer(&vlocation->timeout); @@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation) } /* move to graveyard queue */ - list_del(&vlocation->link); - list_add_tail(&vlocation->link,&cell->vl_graveyard); + list_move_tail(&vlocation->link,&cell->vl_graveyard); /* remove from pending timeout queue (refcounted if actually being * updated) */ diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 9867fef3261d..cf62da5d7825 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode, vnode->cb_expiry * HZ); spin_lock(&afs_cb_hash_lock); - list_del(&vnode->cb_hash_link); - list_add_tail(&vnode->cb_hash_link, + list_move_tail(&vnode->cb_hash_link, &afs_cb_hash(server, &vnode->fid)); spin_unlock(&afs_cb_hash_lock); @@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb) * invoked both for initial i/o submission and * subsequent retries via the aio_kick_handler. * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reaquired + * already held. The lock is released and reacquired * as needed during processing. * * Calls the iocb retry method (already setup for the diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index b8ce02607d66..8dbd44f10e9d 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *ino = autofs4_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); + /* + * Clean stale dentries below that have not been + * invalidated after a mount fail during lookup + */ + d_invalidate(p); + /* allow for dget above and top is already dgot */ if (p == top) ino_count += 2; @@ -370,8 +376,7 @@ next: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); - list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b1c902e319c1..c94d52eafd1b 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm, } /* OK, This is the point of no return */ - set_personality(PER_LINUX); + set_personality(PER_LINUX_32BIT); } /* diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 07a4996cca3f..34ebbc191e46 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -55,6 +55,7 @@ typedef struct { } Node; static DEFINE_RWLOCK(entries_lock); +static struct file_system_type bm_fs_type; static struct vfsmount *bm_mnt; static int entry_count; @@ -637,7 +638,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, if (!inode) goto out2; - err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count); + err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); if (err) { iput(inode); inode = NULL; diff --git a/fs/buffer.c b/fs/buffer.c index 373bb6292bdc..f23bb647db47 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -564,7 +564,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); EXPORT_SYMBOL(cont_prepare_write); -EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f2e285457bee..c28ede599946 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -403,12 +403,14 @@ static struct quotactl_ops cifs_quotactl_ops = { #endif #ifdef CONFIG_CIFS_EXPERIMENTAL -static void cifs_umount_begin(struct super_block * sblock) +static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags) { struct cifs_sb_info *cifs_sb; struct cifsTconInfo * tcon; - cifs_sb = CIFS_SB(sblock); + if (!(flags & MNT_FORCE)) + return; + cifs_sb = CIFS_SB(vfsmnt->mnt_sb); if(cifs_sb == NULL) return; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 6c6771db36da..7caee8d8ea3b 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* If request was not a signal, enqueue and don't free */ if (!(req->uc_flags & REQ_ASYNC)) { req->uc_flags |= REQ_READ; - list_add(&(req->uc_chain), vcp->vc_processing.prev); + list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b040eba13a7d..a5b5e631ba61 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi, ((union inputArgs *)buffer)->ih.unique = req->uc_unique; /* Append msg to pending queue and poke Venus. */ - list_add(&(req->uc_chain), vcommp->vc_pending.prev); + list_add_tail(&(req->uc_chain), &vcommp->vc_pending); wake_up_interruptible(&vcommp->vc_waitq); /* We can be interrupted while we wait for Venus to process diff --git a/fs/compat.c b/fs/compat.c index 7e7e5bc4f3cf..e31e9cf96647 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -55,6 +55,20 @@ extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); +int compat_log = 1; + +int compat_printk(const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, sprintf(buf,"'%c'", (cmd>>24) & 0x3f); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); - printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " "cmd(%08x){%s} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index d2c38875ab29..d8ecfedef189 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -80,6 +80,7 @@ #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> +#include <linux/gigaset_dev.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> @@ -205,38 +206,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); } -struct compat_dmx_event { - dmx_event_t event; - compat_time_t timeStamp; - union - { - dmx_scrambling_status_t scrambling; - } u; -}; - -static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct dmx_event kevent; - mm_segment_t old_fs = get_fs(); - int err; - - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long) &kevent); - set_fs(old_fs); - - if (!err) { - struct compat_dmx_event __user *up = compat_ptr(arg); - - err = put_user(kevent.event, &up->event); - err |= put_user(kevent.timeStamp, &up->timeStamp); - err |= put_user(kevent.u.scrambling, &up->u.scrambling); - if (err) - err = -EFAULT; - } - - return err; -} - struct compat_video_event { int32_t type; compat_time_t timestamp; @@ -2964,7 +2933,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata) #endif /* dvb */ -HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event) HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f952187fc53..207f8006fd6c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); + list_move(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 94dab7bdd851..3e5fe843e1df 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = { int configfs_pin_fs(void) { - return simple_pin_fs("configfs", &configfs_mount, + return simple_pin_fs(&configfs_fs_type, &configfs_mount, &configfs_mnt_count); } diff --git a/fs/dcache.c b/fs/dcache.c index 313b54b2b8f2..48b44a714b35 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -406,7 +406,7 @@ static void prune_dcache(int count, struct super_block *sb) cond_resched_lock(&dcache_lock); tmp = dentry_unused.prev; - if (unlikely(sb)) { + if (sb) { /* Try to find a dentry for this sb, but don't try * too hard, if they aren't near the tail they will * be moved down again soon @@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb) dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* @@ -638,7 +637,7 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 440128ebef3b..6fa1e04f8415 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, pr_debug("debugfs: creating file '%s'\n",name); - error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count); + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) goto exit; diff --git a/fs/dquot.c b/fs/dquot.c index 81d87a413c68..0122a279106a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { - list_add(&dquot->dq_free, free_dquots.prev); + list_add_tail(&dquot->dq_free, &free_dquots); dqstats.free_dquots++; } @@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ - list_add(&dquot->dq_inuse, inuse_list.prev); + list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats.allocated_dquots++; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 08e7e6a555ca..9c677bbd0b08 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1,6 +1,6 @@ /* * fs/eventpoll.c ( Efficent event polling implementation ) - * Copyright (C) 2001,...,2003 Davide Libenzi + * Copyright (C) 2001,...,2006 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE); if (waitqueue_active(&ep->poll_wait)) pwake++; } @@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | + TASK_INTERRUPTIBLE); if (waitqueue_active(&ep->poll_wait)) pwake++; } @@ -1260,7 +1261,8 @@ is_linked: * wait list. */ if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | + TASK_INTERRUPTIBLE); if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) * wait list. */ if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); + __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | + TASK_INTERRUPTIBLE); if (waitqueue_active(&ep->poll_wait)) pwake++; } @@ -1516,7 +1519,7 @@ retry: * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); - add_wait_queue(&ep->wq, &wait); + __add_wait_queue(&ep->wq, &wait); for (;;) { /* @@ -1536,7 +1539,7 @@ retry: jtimeout = schedule_timeout(jtimeout); write_lock_irqsave(&ep->lock, flags); } - remove_wait_queue(&ep->wq, &wait); + __remove_wait_queue(&ep->wq, &wait); set_current_state(TASK_RUNNING); } diff --git a/fs/exec.c b/fs/exec.c index 0b88bf646143..c8494f513eaf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; @@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* @@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_threads (struct mm_struct *mm) +static void zap_process(struct task_struct *start) { - struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - int traced = 0; + struct task_struct *t; - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; - read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); } - while_each_thread(g,p); + } while ((t = next_thread(t)) != start); +} - read_unlock(&tasklist_lock); +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->group_exit_code = exit_code; + zap_process(tsk); + err = 0; + } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; - if (unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. - */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + + p = g; + do { + if (p->mm) { + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); + zap_process(p); + unlock_task_sighand(p, &flags); + } + break; } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); + } while ((p = next_thread(p)) != g); } + rcu_read_unlock(); +done: + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index c5d02da73bc3..e0b2b43c1fdb 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT2_FS) += ext2.o -ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 2c00953d4b0b..433a213a8bd9 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -521,6 +521,26 @@ io_error: goto out_release; } +#ifdef EXT2FS_DEBUG + +static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); +} + +#endif /* EXT2FS_DEBUG */ + +/* Superblock must be locked */ unsigned long ext2_count_free_blocks (struct super_block * sb) { struct ext2_group_desc * desc; @@ -530,7 +550,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) unsigned long bitmap_count, x; struct ext2_super_block *es; - lock_super (sb); es = EXT2_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; @@ -554,7 +573,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb) printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n", (long)le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); - unlock_super (sb); return bitmap_count; #else for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c deleted file mode 100644 index e9983a0dd396..000000000000 --- a/fs/ext2/bitmap.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * linux/fs/ext2/bitmap.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#ifdef EXT2FS_DEBUG - -#include <linux/buffer_head.h> - -#include "ext2.h" - -static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - -unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) -{ - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); -} - -#endif /* EXT2FS_DEBUG */ - diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 3c1c9aaaca6b..92ea8265d7d5 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -399,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry) de = ext2_find_entry (dir, dentry, &page); if (de) { res = le32_to_cpu(de->inode); - kunmap(page); - page_cache_release(page); + ext2_put_page(page); } return res; } diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c index c9c2e5ffa48e..7806b9e8155b 100644 --- a/fs/ext2/fsync.c +++ b/fs/ext2/fsync.c @@ -24,7 +24,7 @@ #include "ext2.h" #include <linux/smp_lock.h> -#include <linux/buffer_head.h> /* for fsync_inode_buffers() */ +#include <linux/buffer_head.h> /* for sync_mapping_buffers() */ /* diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index e52765219e16..308c252568c6 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -638,6 +638,7 @@ fail: return ERR_PTR(err); } +/* Superblock must be locked */ unsigned long ext2_count_free_inodes (struct super_block * sb) { struct ext2_group_desc *desc; @@ -649,7 +650,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb) unsigned long bitmap_count = 0; struct buffer_head *bitmap_bh = NULL; - lock_super (sb); es = EXT2_SB(sb)->s_es; for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { unsigned x; @@ -672,7 +672,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb) printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter), desc_count, bitmap_count); - unlock_super(sb); return desc_count; #else for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ee4ba759581e..d4233b2e6436 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -854,7 +854,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } if (!ext2_check_descriptors (sb)) { printk ("EXT2-fs: group descriptors corrupted!\n"); - db_count = i; goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1046,6 +1045,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) unsigned long overhead; int i; + lock_super(sb); if (test_opt (sb, MINIX_DF)) overhead = 0; else { @@ -1086,6 +1086,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count); buf->f_ffree = ext2_count_free_inodes (sb); buf->f_namelen = EXT2_NAME_LEN; + unlock_super(sb); return 0; } diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 77927d6938f6..96172e89ddc3 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -163,20 +163,19 @@ restart: #endif static int -goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal, +goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, unsigned int group, struct super_block * sb) { - unsigned long group_first_block, group_last_block; + ext3_fsblk_t group_first_block, group_last_block; - group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + - group * EXT3_BLOCKS_PER_GROUP(sb); + group_first_block = ext3_group_first_block_no(sb, group); group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; if ((rsv->_rsv_start > group_last_block) || (rsv->_rsv_end < group_first_block)) return 0; - if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start) - || (goal + group_first_block > rsv->_rsv_end))) + if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) + || (grp_goal + group_first_block > rsv->_rsv_end))) return 0; return 1; } @@ -187,7 +186,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal, * Returns NULL if there are no windows or if all windows start after the goal. */ static struct ext3_reserve_window_node * -search_reserve_window(struct rb_root *root, unsigned long goal) +search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) { struct rb_node *n = root->rb_node; struct ext3_reserve_window_node *rsv; @@ -223,7 +222,7 @@ void ext3_rsv_window_add(struct super_block *sb, { struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root; struct rb_node *node = &rsv->rsv_node; - unsigned int start = rsv->rsv_start; + ext3_fsblk_t start = rsv->rsv_start; struct rb_node ** p = &root->rb_node; struct rb_node * parent = NULL; @@ -310,20 +309,20 @@ void ext3_discard_reservation(struct inode *inode) /* Free given blocks, update quota and i_blocks field */ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, - unsigned long block, unsigned long count, - int *pdquot_freed_blocks) + ext3_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; unsigned long block_group; - unsigned long bit; + ext3_grpblk_t bit; unsigned long i; unsigned long overflow; struct ext3_group_desc * desc; struct ext3_super_block * es; struct ext3_sb_info *sbi; int err = 0, ret; - unsigned group_freed; + ext3_grpblk_t group_freed; *pdquot_freed_blocks = 0; sbi = EXT3_SB(sb); @@ -333,7 +332,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, block + count > le32_to_cpu(es->s_blocks_count)) { ext3_error (sb, "ext3_free_blocks", "Freeing blocks not in datazone - " - "block = %lu, count = %lu", block, count); + "block = "E3FSBLK", count = %lu", block, count); goto error_return; } @@ -369,7 +368,7 @@ do_more: sbi->s_itb_per_group)) ext3_error (sb, "ext3_free_blocks", "Freeing blocks in system zones - " - "Block = %lu, count = %lu", + "Block = "E3FSBLK", count = %lu", block, count); /* @@ -453,7 +452,8 @@ do_more: bit + i, bitmap_bh->b_data)) { jbd_unlock_bh_state(bitmap_bh); ext3_error(sb, __FUNCTION__, - "bit already cleared for block %lu", block + i); + "bit already cleared for block "E3FSBLK, + block + i); jbd_lock_bh_state(bitmap_bh); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { @@ -493,10 +493,10 @@ error_return: /* Free given blocks, update quota and i_blocks field */ void ext3_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) + ext3_fsblk_t block, unsigned long count) { struct super_block * sb; - int dquot_freed_blocks; + unsigned long dquot_freed_blocks; sb = inode->i_sb; if (!sb) { @@ -525,7 +525,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, * data-writes at some point, and disable it for metadata allocations or * sync-data inodes. */ -static int ext3_test_allocatable(int nr, struct buffer_head *bh) +static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) { int ret; struct journal_head *jh = bh2jh(bh); @@ -542,11 +542,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh) return ret; } -static int -bitmap_search_next_usable_block(int start, struct buffer_head *bh, - int maxblocks) +static ext3_grpblk_t +bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) { - int next; + ext3_grpblk_t next; struct journal_head *jh = bh2jh(bh); /* @@ -576,10 +576,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh, * the initial goal; then for a free byte somewhere in the bitmap; then * for any free bit in the bitmap. */ -static int -find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) +static ext3_grpblk_t +find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, + ext3_grpblk_t maxblocks) { - int here, next; + ext3_grpblk_t here, next; char *p, *r; if (start > 0) { @@ -591,7 +592,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the * next 64-bit boundary is simple.. */ - int end_goal = (start + 63) & ~63; + ext3_grpblk_t end_goal = (start + 63) & ~63; if (end_goal > maxblocks) end_goal = maxblocks; here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); @@ -628,7 +629,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks) * zero (failure). */ static inline int -claim_block(spinlock_t *lock, int block, struct buffer_head *bh) +claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); int ret; @@ -651,19 +652,18 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh) * new bitmap. In that case we must release write access to the old one via * ext3_journal_release_buffer(), else we'll run out of credits. */ -static int +static ext3_grpblk_t ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, - struct buffer_head *bitmap_bh, int goal, + struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal, unsigned long *count, struct ext3_reserve_window *my_rsv) { - int group_first_block, start, end; + ext3_fsblk_t group_first_block; + ext3_grpblk_t start, end; unsigned long num = 0; /* we do allocation within the reservation window if we have a window */ if (my_rsv) { - group_first_block = - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + - group * EXT3_BLOCKS_PER_GROUP(sb); + group_first_block = ext3_group_first_block_no(sb, group); if (my_rsv->_rsv_start >= group_first_block) start = my_rsv->_rsv_start - group_first_block; else @@ -673,13 +673,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, if (end > EXT3_BLOCKS_PER_GROUP(sb)) /* reservation window crosses group boundary */ end = EXT3_BLOCKS_PER_GROUP(sb); - if ((start <= goal) && (goal < end)) - start = goal; + if ((start <= grp_goal) && (grp_goal < end)) + start = grp_goal; else - goal = -1; + grp_goal = -1; } else { - if (goal > 0) - start = goal; + if (grp_goal > 0) + start = grp_goal; else start = 0; end = EXT3_BLOCKS_PER_GROUP(sb); @@ -688,43 +688,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group, BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb)); repeat: - if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) { - goal = find_next_usable_block(start, bitmap_bh, end); - if (goal < 0) + if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) { + grp_goal = find_next_usable_block(start, bitmap_bh, end); + if (grp_goal < 0) goto fail_access; if (!my_rsv) { int i; - for (i = 0; i < 7 && goal > start && - ext3_test_allocatable(goal - 1, + for (i = 0; i < 7 && grp_goal > start && + ext3_test_allocatable(grp_goal - 1, bitmap_bh); - i++, goal--) + i++, grp_goal--) ; } } - start = goal; + start = grp_goal; - if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { /* * The block was allocated by another thread, or it was * allocated and then freed by another thread */ start++; - goal++; + grp_goal++; if (start >= end) goto fail_access; goto repeat; } num++; - goal++; - while (num < *count && goal < end - && ext3_test_allocatable(goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { + grp_goal++; + while (num < *count && grp_goal < end + && ext3_test_allocatable(grp_goal, bitmap_bh) + && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { num++; - goal++; + grp_goal++; } *count = num; - return goal - num; + return grp_goal - num; fail_access: *count = num; return -1; @@ -766,12 +766,13 @@ fail_access: static int find_next_reservable_window( struct ext3_reserve_window_node *search_head, struct ext3_reserve_window_node *my_rsv, - struct super_block * sb, int start_block, - int last_block) + struct super_block * sb, + ext3_fsblk_t start_block, + ext3_fsblk_t last_block) { struct rb_node *next; struct ext3_reserve_window_node *rsv, *prev; - int cur; + ext3_fsblk_t cur; int size = my_rsv->rsv_goal_size; /* TODO: make the start of the reservation window byte-aligned */ @@ -873,10 +874,10 @@ static int find_next_reservable_window( * * @rsv: the reservation * - * @goal: The goal (group-relative). It is where the search for a + * @grp_goal: The goal (group-relative). It is where the search for a * free reservable space should start from. - * if we have a goal(goal >0 ), then start from there, - * no goal(goal = -1), we start from the first block + * if we have a grp_goal(grp_goal >0 ), then start from there, + * no grp_goal(grp_goal = -1), we start from the first block * of the group. * * @sb: the super block @@ -885,25 +886,24 @@ static int find_next_reservable_window( * */ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, - int goal, struct super_block *sb, + ext3_grpblk_t grp_goal, struct super_block *sb, unsigned int group, struct buffer_head *bitmap_bh) { struct ext3_reserve_window_node *search_head; - int group_first_block, group_end_block, start_block; - int first_free_block; + ext3_fsblk_t group_first_block, group_end_block, start_block; + ext3_grpblk_t first_free_block; struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; unsigned long size; int ret; spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; - group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + - group * EXT3_BLOCKS_PER_GROUP(sb); + group_first_block = ext3_group_first_block_no(sb, group); group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; - if (goal < 0) + if (grp_goal < 0) start_block = group_first_block; else - start_block = goal + group_first_block; + start_block = grp_goal + group_first_block; size = my_rsv->rsv_goal_size; @@ -1057,14 +1057,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, * sorted double linked list should be fast. * */ -static int +static ext3_grpblk_t ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, unsigned int group, struct buffer_head *bitmap_bh, - int goal, struct ext3_reserve_window_node * my_rsv, + ext3_grpblk_t grp_goal, + struct ext3_reserve_window_node * my_rsv, unsigned long *count, int *errp) { - unsigned long group_first_block; - int ret = 0; + ext3_fsblk_t group_first_block; + ext3_grpblk_t ret = 0; int fatal; unsigned long num = *count; @@ -1090,17 +1091,16 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, */ if (my_rsv == NULL ) { ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, - goal, count, NULL); + grp_goal, count, NULL); goto out; } /* - * goal is a group relative block number (if there is a goal) - * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) + * grp_goal is a group relative block number (if there is a goal) + * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb) * first block is a filesystem wide block number * first block is the block number of the first block in this group */ - group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + - group * EXT3_BLOCKS_PER_GROUP(sb); + group_first_block = ext3_group_first_block_no(sb, group); /* * Basically we will allocate a new block from inode's reservation @@ -1119,24 +1119,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, */ while (1) { if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { + !goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) { if (my_rsv->rsv_goal_size < *count) my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, goal, sb, + ret = alloc_new_reservation(my_rsv, grp_goal, sb, group, bitmap_bh); if (ret < 0) break; /* failed */ - if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) - goal = -1; - } else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count) + if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) + grp_goal = -1; + } else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count) try_to_extend_reservation(my_rsv, sb, - *count-my_rsv->rsv_end + goal - 1); + *count-my_rsv->rsv_end + grp_goal - 1); if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) || (my_rsv->rsv_end < group_first_block)) BUG(); - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal, &num, &my_rsv->rsv_window); if (ret >= 0) { my_rsv->rsv_alloc_hit += num; @@ -1164,7 +1164,7 @@ out: static int ext3_has_free_blocks(struct ext3_sb_info *sbi) { - int free_blocks, root_blocks; + ext3_fsblk_t free_blocks, root_blocks; free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); @@ -1200,19 +1200,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries) * bitmap, and then for any free bit if that fails. * This function also updates quota and i_blocks field. */ -int ext3_new_blocks(handle_t *handle, struct inode *inode, - unsigned long goal, unsigned long *count, int *errp) +ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, unsigned long *count, int *errp) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gdp_bh; int group_no; int goal_group; - int ret_block; + ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */ + ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ + ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */ int bgi; /* blockgroup iteration index */ - int target_block; int fatal = 0, err; int performed_allocation = 0; - int free_blocks; + ext3_grpblk_t free_blocks; /* number of free blocks in a group */ struct super_block *sb; struct ext3_group_desc *gdp; struct ext3_super_block *es; @@ -1285,16 +1286,17 @@ retry: my_rsv = NULL; if (free_blocks > 0) { - ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % + grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT3_BLOCKS_PER_GROUP(sb)); bitmap_bh = read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; - ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no, - bitmap_bh, ret_block, my_rsv, &num, &fatal); + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, grp_target_blk, + my_rsv, &num, &fatal); if (fatal) goto out; - if (ret_block >= 0) + if (grp_alloc_blk >= 0) goto allocated; } @@ -1327,11 +1329,15 @@ retry: bitmap_bh = read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; - ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no, - bitmap_bh, -1, my_rsv, &num, &fatal); + /* + * try to allocate block(s) from this group, without a goal(-1). + */ + grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle, + group_no, bitmap_bh, -1, my_rsv, + &num, &fatal); if (fatal) goto out; - if (ret_block >= 0) + if (grp_alloc_blk >= 0) goto allocated; } /* @@ -1360,18 +1366,18 @@ allocated: if (fatal) goto out; - target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(es->s_first_data_block); + ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no); - if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) || - in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) || - in_range(target_block, le32_to_cpu(gdp->bg_inode_table), + if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) || + in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) || + in_range(ret_block, le32_to_cpu(gdp->bg_inode_table), EXT3_SB(sb)->s_itb_per_group) || - in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table), + in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table), EXT3_SB(sb)->s_itb_per_group)) ext3_error(sb, "ext3_new_block", "Allocating block in system zone - " - "blocks from %u, length %lu", target_block, num); + "blocks from "E3FSBLK", length %lu", + ret_block, num); performed_allocation = 1; @@ -1380,7 +1386,7 @@ allocated: struct buffer_head *debug_bh; /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, target_block); + debug_bh = sb_find_get_block(sb, ret_block); if (debug_bh) { BUFFER_TRACE(debug_bh, "state when allocated"); BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); @@ -1393,24 +1399,21 @@ allocated: int i; for (i = 0; i < num; i++) { - if (ext3_test_bit(ret_block, + if (ext3_test_bit(grp_alloc_blk+i, bh2jh(bitmap_bh)->b_committed_data)) { printk("%s: block was unexpectedly set in " "b_committed_data\n", __FUNCTION__); } } } - ext3_debug("found bit %d\n", ret_block); + ext3_debug("found bit %d\n", grp_alloc_blk); spin_unlock(sb_bgl_lock(sbi, group_no)); jbd_unlock_bh_state(bitmap_bh); #endif - /* ret_block was blockgroup-relative. Now it becomes fs-relative */ - ret_block = target_block; - if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) { ext3_error(sb, "ext3_new_block", - "block(%d) >= blocks count(%d) - " + "block("E3FSBLK") >= blocks count(%d) - " "block_group = %d, es == %p ", ret_block, le32_to_cpu(es->s_blocks_count), group_no, es); goto out; @@ -1421,7 +1424,7 @@ allocated: * list of some description. We don't know in advance whether * the caller wants to use it as metadata or data. */ - ext3_debug("allocating block %d. Goal hits %d of %d.\n", + ext3_debug("allocating block %lu. Goal hits %d of %d.\n", ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); @@ -1461,23 +1464,24 @@ out: return 0; } -int ext3_new_block(handle_t *handle, struct inode *inode, - unsigned long goal, int *errp) +ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, + ext3_fsblk_t goal, int *errp) { unsigned long count = 1; return ext3_new_blocks(handle, inode, goal, &count, errp); } -unsigned long ext3_count_free_blocks(struct super_block *sb) +ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) { - unsigned long desc_count; + ext3_fsblk_t desc_count; struct ext3_group_desc *gdp; int i; unsigned long ngroups = EXT3_SB(sb)->s_groups_count; #ifdef EXT3FS_DEBUG struct ext3_super_block *es; - unsigned long bitmap_count, x; + ext3_fsblk_t bitmap_count; + unsigned long x; struct buffer_head *bitmap_bh = NULL; es = EXT3_SB(sb)->s_es; @@ -1502,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb) bitmap_count += x; } brelse(bitmap_bh); - printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); + printk("ext3_count_free_blocks: stored = "E3FSBLK + ", computed = "E3FSBLK", "E3FSBLK"\n", + le32_to_cpu(es->s_free_blocks_count), + desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; @@ -1520,7 +1526,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb) } static inline int -block_in_use(unsigned long block, struct super_block *sb, unsigned char *map) +block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map) { return ext3_test_bit ((block - le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) % diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index dc826464f313..36546ed36a14 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); int freei, avefreei; - int freeb, avefreeb; - int blocks_per_dir, ndirs; - int max_debt, max_dirs, min_blocks, min_inodes; + ext3_fsblk_t freeb, avefreeb; + ext3_fsblk_t blocks_per_dir; + int ndirs; + int max_debt, max_dirs, min_inodes; + ext3_grpblk_t min_blocks; int group = -1, i; struct ext3_group_desc *desc; struct buffer_head *bh; @@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) min_inodes = avefreei - inodes_per_group / 4; min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4; - max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST); + max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST); if (max_debt * INODE_COST > inodes_per_group) max_debt = inodes_per_group / INODE_COST; if (max_debt > 255) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2edd7eec88fd..0321e1b9034a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode) * still needs to be revoked. */ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, int blocknr) + struct buffer_head *bh, ext3_fsblk_t blocknr) { int err; @@ -407,13 +407,13 @@ no_block: * * Caller must make sure that @ind is valid and will stay that way. */ -static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) +static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) { struct ext3_inode_info *ei = EXT3_I(inode); __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; __le32 *p; - unsigned long bg_start; - unsigned long colour; + ext3_fsblk_t bg_start; + ext3_grpblk_t colour; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { @@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ - bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group); colour = (current->pid % 16) * (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); return bg_start + colour; @@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind) * stores it in *@goal and returns zero. */ -static unsigned long ext3_find_goal(struct inode *inode, long block, +static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, Indirect chain[4], Indirect *partial) { struct ext3_block_alloc_info *block_i; @@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks, * direct blocks */ static int ext3_alloc_blocks(handle_t *handle, struct inode *inode, - unsigned long goal, int indirect_blks, int blks, - unsigned long long new_blocks[4], int *err) + ext3_fsblk_t goal, int indirect_blks, int blks, + ext3_fsblk_t new_blocks[4], int *err) { int target, i; unsigned long count = 0; int index = 0; - unsigned long current_block = 0; + ext3_fsblk_t current_block = 0; int ret = 0; /* @@ -592,7 +591,7 @@ failed_out: * as described above and return 0. */ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, unsigned long goal, + int indirect_blks, int *blks, ext3_fsblk_t goal, int *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; @@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode, int err = 0; struct buffer_head *bh; int num; - unsigned long long new_blocks[4]; - unsigned long long current_block; + ext3_fsblk_t new_blocks[4]; + ext3_fsblk_t current_block; num = ext3_alloc_blocks(handle, inode, goal, indirect_blks, *blks, new_blocks, &err); @@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, int i; int err = 0; struct ext3_block_alloc_info *block_i; - unsigned long current_block; + ext3_fsblk_t current_block; block_i = EXT3_I(inode)->i_block_alloc_info; /* @@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, int offsets[4]; Indirect chain[4]; Indirect *partial; - unsigned long goal; + ext3_fsblk_t goal; int indirect_blks; int blocks_to_boundary = 0; int depth; struct ext3_inode_info *ei = EXT3_I(inode); int count = 0; - unsigned long first_block = 0; + ext3_fsblk_t first_block = 0; J_ASSERT(handle != NULL || create == 0); @@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { - unsigned long blk; + ext3_fsblk_t blk; if (!verify_chain(chain, partial)) { /* @@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode) static int ext3_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from) { - unsigned long index = from >> PAGE_CACHE_SHIFT; + ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize, iblock, length, pos; struct inode *inode = mapping->host; @@ -1960,7 +1959,7 @@ no_top: * than `count' because there can be holes in there. */ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, unsigned long block_to_free, + struct buffer_head *bh, ext3_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; @@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { - unsigned long block_to_free = 0; /* Starting block # of a run */ + ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ - unsigned long nr; /* Current block # */ + ext3_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err; @@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { - unsigned long nr; + ext3_fsblk_t nr; __le32 *p; if (is_handle_aborted(handle)) @@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, */ if (!bh) { ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%ld, block=%ld", + "Read failure, inode=%ld, block="E3FSBLK, inode->i_ino, nr); continue; } @@ -2394,11 +2393,12 @@ out_stop: ext3_journal_stop(handle); } -static unsigned long ext3_get_inode_block(struct super_block *sb, +static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, unsigned long ino, struct ext3_iloc *iloc) { unsigned long desc, group_desc, block_group; - unsigned long offset, block; + unsigned long offset; + ext3_fsblk_t block; struct buffer_head *bh; struct ext3_group_desc * gdp; @@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb, static int __ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc, int in_mem) { - unsigned long block; + ext3_fsblk_t block; struct buffer_head *bh; block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); @@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode, if (!bh) { ext3_error (inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", inode->i_ino, block); + "inode=%lu, block="E3FSBLK, + inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -2540,7 +2541,7 @@ make_io: if (!buffer_uptodate(bh)) { ext3_error(inode->i_sb, "ext3_get_inode_loc", "unable to read inode block - " - "inode=%lu, block=%lu", + "inode=%lu, block="E3FSBLK, inode->i_ino, block); brelse(bh); return -EIO; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 8c22aa9a7fbb..3a6b012d120c 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -204,7 +204,7 @@ flags_err: return 0; } case EXT3_IOC_GROUP_EXTEND: { - unsigned long n_blocks_count; + ext3_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; int err; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b8f5cd1e540d..d9176dba3698 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, int dx_fallback=0; #endif unsigned blocksize; - unsigned nlen, rlen; u32 block, blocks; sb = dir->i_sb; @@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, return retval; de = (struct ext3_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = cpu_to_le16(rlen = blocksize); - nlen = 0; + de->rec_len = cpu_to_le16(blocksize); return add_dirent_to_buf(handle, dentry, inode, de, bh); } diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 34b39e9a1e5a..dfd811895d8f 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb, { struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; - unsigned start = le32_to_cpu(es->s_blocks_count); - unsigned end = start + input->blocks_count; + ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count); + ext3_fsblk_t end = start + input->blocks_count; unsigned group = input->group; - unsigned itend = input->inode_table + sbi->s_itb_per_group; + ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead = ext3_bg_has_super(sb, group) ? (1 + ext3_bg_num_gdb(sb, group) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - unsigned metaend = start + overhead; + ext3_fsblk_t metaend = start + overhead; struct buffer_head *bh = NULL; - int free_blocks_count; + ext3_grpblk_t free_blocks_count; int err = -EINVAL; input->free_blocks_count = free_blocks_count = @@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb, ext3_warning(sb, __FUNCTION__, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)", + ext3_warning(sb, __FUNCTION__, + "Cannot read last block ("E3FSBLK")", end - 1); else if (outside(input->block_bitmap, start, end)) ext3_warning(sb, __FUNCTION__, @@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb, else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) ext3_warning(sb, __FUNCTION__, - "Inode table not in group (blocks %u-%u)", + "Inode table not in group (blocks %u-"E3FSBLK")", input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) ext3_warning(sb, __FUNCTION__, @@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb, input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) ext3_warning(sb, __FUNCTION__, - "Block bitmap (%u) in inode table (%u-%u)", + "Block bitmap (%u) in inode table (%u-"E3FSBLK")", input->block_bitmap, input->inode_table, itend-1); else if (inside(input->inode_bitmap, input->inode_table, itend)) ext3_warning(sb, __FUNCTION__, - "Inode bitmap (%u) in inode table (%u-%u)", + "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", input->inode_bitmap, input->inode_table, itend-1); else if (inside(input->block_bitmap, start, metaend)) ext3_warning(sb, __FUNCTION__, - "Block bitmap (%u) in GDT table (%u-%u)", + "Block bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) ext3_warning(sb, __FUNCTION__, - "Inode bitmap (%u) in GDT table (%u-%u)", + "Inode bitmap (%u) in GDT table" + " ("E3FSBLK"-"E3FSBLK")", input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) ext3_warning(sb, __FUNCTION__, - "Inode table (%u-%u) overlaps GDT table (%u-%u)", + "Inode table (%u-"E3FSBLK") overlaps" + "GDT table ("E3FSBLK"-"E3FSBLK")", input->inode_table, itend - 1, start, metaend - 1); else err = 0; @@ -112,7 +116,7 @@ static int verify_group_input(struct super_block *sb, } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, - unsigned long blk) + ext3_fsblk_t blk) { struct buffer_head *bh; int err; @@ -163,15 +167,14 @@ static int setup_new_group_blocks(struct super_block *sb, struct ext3_new_group_data *input) { struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long start = input->group * sbi->s_blocks_per_group + - le32_to_cpu(sbi->s_es->s_first_data_block); + ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group); int reserved_gdb = ext3_bg_has_super(sb, input->group) ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); struct buffer_head *bh; handle_t *handle; - unsigned long block; - int bit; + ext3_fsblk_t block; + ext3_grpblk_t bit; int i; int err = 0, err2; @@ -328,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three, static int verify_reserved_gdb(struct super_block *sb, struct buffer_head *primary) { - const unsigned long blk = primary->b_blocknr; + const ext3_fsblk_t blk = primary->b_blocknr; const unsigned long end = EXT3_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; @@ -340,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ ext3_warning(sb, __FUNCTION__, - "reserved GDT %ld missing grp %d (%ld)", + "reserved GDT "E3FSBLK + " missing grp %d ("E3FSBLK")", blk, grp, grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; @@ -372,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext3_super_block *es = EXT3_SB(sb)->s_es; unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); - unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; int gdbackups; @@ -417,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__u32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { ext3_warning(sb, __FUNCTION__, - "new group %u GDT block %lu not reserved", + "new group %u GDT block "E3FSBLK" not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -515,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, struct buffer_head **primary; struct buffer_head *dind; struct ext3_iloc iloc; - unsigned long blk; + ext3_fsblk_t blk; __u32 *data, *end; int gdbackups = 0; int res, i; @@ -540,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext3_warning(sb, __FUNCTION__, - "reserved block %lu not at offset %ld", + "reserved block "E3FSBLK + " not at offset %ld", blk, (long)(data - (__u32 *)dind->b_data)); err = -EINVAL; goto exit_bh; @@ -902,15 +907,16 @@ exit_put: * GDT blocks are reserved to grow to the desired size. */ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, - unsigned long n_blocks_count) + ext3_fsblk_t n_blocks_count) { - unsigned long o_blocks_count; + ext3_fsblk_t o_blocks_count; unsigned long o_groups_count; - unsigned long last; - int add; + ext3_grpblk_t last; + ext3_grpblk_t add; struct buffer_head * bh; handle_t *handle; - int err, freed_blocks; + int err; + unsigned long freed_blocks; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -919,12 +925,22 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n", + printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3-fs: filesystem on %s:" + " too large to resize to %lu blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext3_warning(sb, __FUNCTION__, + "CONFIG_LBD not enabled\n"); + return -EINVAL; + } + if (n_blocks_count < o_blocks_count) { ext3_warning(sb, __FUNCTION__, "can't shrink FS - resize aborted"); @@ -948,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count + add < n_blocks_count) ext3_warning(sb, __FUNCTION__, - "will only finish group (%lu blocks, %u new)", + "will only finish group ("E3FSBLK + " blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ @@ -991,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); sb->s_dirt = 1; unlock_super(sb); - ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count, + ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext3_debug("freed blocks %ld through %ld\n", o_blocks_count, + ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); if ((err = ext3_journal_stop(handle))) goto exit_put; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index a60cc6ec130f..b7483360a2db 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -630,7 +630,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -666,6 +666,7 @@ static match_table_t tokens = { {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, @@ -689,14 +690,15 @@ static match_table_t tokens = { {Opt_resize, "resize"}, }; -static unsigned long get_sb_block(void **data) +static ext3_fsblk_t get_sb_block(void **data) { - unsigned long sb_block; + ext3_fsblk_t sb_block; char *options = (char *) *data; if (!options || strncmp(options, "sb=", 3) != 0) return 1; /* Default location */ options += 3; + /*todo: use simple_strtoll with >32bit ext3 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { printk("EXT3-fs: Invalid sb specification: %s\n", @@ -711,7 +713,7 @@ static unsigned long get_sb_block(void **data) static int parse_options (char *options, struct super_block *sb, unsigned long *inum, unsigned long *journal_devnum, - unsigned long *n_blocks_count, int is_remount) + ext3_fsblk_t *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); char * p; @@ -1013,6 +1015,9 @@ clear_qf_name: case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1128,7 +1133,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, static int ext3_check_descriptors (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block); struct ext3_group_desc * gdp = NULL; int desc_block = 0; int i; @@ -1315,15 +1320,14 @@ static loff_t ext3_max_size(int bits) return res; } -static unsigned long descriptor_loc(struct super_block *sb, - unsigned long logic_sb_block, +static ext3_fsblk_t descriptor_loc(struct super_block *sb, + ext3_fsblk_t logic_sb_block, int nr) { struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long bg, first_data_block, first_meta_bg; + unsigned long bg, first_meta_bg; int has_super = 0; - first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block); first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) || @@ -1332,7 +1336,7 @@ static unsigned long descriptor_loc(struct super_block *sb, bg = sbi->s_desc_per_block * nr; if (ext3_bg_has_super(sb, bg)) has_super = 1; - return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); + return (has_super + ext3_group_first_block_no(sb, bg)); } @@ -1341,9 +1345,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) struct buffer_head * bh; struct ext3_super_block *es = NULL; struct ext3_sb_info *sbi; - unsigned long block; - unsigned long sb_block = get_sb_block(&data); - unsigned long logic_sb_block; + ext3_fsblk_t block; + ext3_fsblk_t sb_block = get_sb_block(&data); + ext3_fsblk_t logic_sb_block; unsigned long offset = 0; unsigned long journal_inum = 0; unsigned long journal_devnum = 0; @@ -1565,6 +1569,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le32_to_cpu(es->s_blocks_count) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT3-fs: filesystem on %s:" + " too large to mount safely\n", sb->s_id); + if (sizeof(sector_t) < 8) + printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not " + "enabled\n"); + goto failed_mount; + } + if (EXT3_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext3; sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - @@ -1593,7 +1607,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } } if (!ext3_check_descriptors (sb)) { - printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); + printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n"); goto failed_mount2; } sbi->s_gdb_count = db_count; @@ -1830,10 +1844,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, { struct buffer_head * bh; journal_t *journal; - int start; - int len; + ext3_fsblk_t start; + ext3_fsblk_t len; int hblock, blocksize; - unsigned long sb_block; + ext3_fsblk_t sb_block; unsigned long offset; struct ext3_super_block * es; struct block_device *bdev; @@ -2206,7 +2220,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) { struct ext3_super_block * es; struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long n_blocks_count = 0; + ext3_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext3_mount_options old_opts; int err; @@ -2326,7 +2340,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; - unsigned long overhead; + ext3_fsblk_t overhead; int i; if (test_opt (sb, MINIX_DF)) diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index e8d60bf6b7df..a44a0562203a 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT3_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) goto cleanup; @@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { bad_block: ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block %d", inode->i_ino, + "inode %ld: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) error = 0; if (!EXT3_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); error = -EIO; if (!bh) @@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block %d", inode->i_ino, + "inode %ld: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext3_xattr_check_block(bs->bh)) { ext3_error(sb, __FUNCTION__, - "inode %ld: bad block %d", inode->i_ino, + "inode %ld: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -792,11 +792,12 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - int goal = le32_to_cpu( + ext3_fsblk_t goal = le32_to_cpu( EXT3_SB(sb)->s_es->s_first_data_block) + - EXT3_I(inode)->i_block_group * + (ext3_fsblk_t)EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); - int block = ext3_new_block(handle, inode, goal, &error); + ext3_fsblk_t block = ext3_new_block(handle, inode, + goal, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); @@ -847,7 +848,7 @@ cleanup_dquot: bad_block: ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block %d", inode->i_ino, + "inode %ld: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: block %d read error", inode->i_ino, + "inode %ld: block "E3FSBLK" read error", inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block %d", inode->i_ino, + "inode %ld: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } @@ -1210,11 +1211,11 @@ again: bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: block %ld read error", + "inode %ld: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= EXT3_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %ld refcount %d>=%d", + ea_idebug(inode, "block %lu refcount %d>=%d", (unsigned long) ce->e_block, le32_to_cpu(BHDR(bh)->h_refcount), EXT3_XATTR_REFCOUNT_MAX); diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index 583bd78086d8..d35979a58743 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -159,11 +159,11 @@ struct vxfs_sb { * In core superblock filesystem private data for VxFS. */ struct vxfs_sb_info { - struct vxfs_sb *vsi_raw; /* raw (on disk) supeblock */ + struct vxfs_sb *vsi_raw; /* raw (on disk) superblock */ struct buffer_head *vsi_bp; /* buffer for raw superblock*/ struct inode *vsi_fship; /* fileset header inode */ struct inode *vsi_ilist; /* inode list inode */ - struct inode *vsi_stilist; /* structual inode list inode */ + struct inode *vsi_stilist; /* structural inode list inode */ u_long vsi_iext; /* initial inode list */ ino_t vsi_fshino; /* fileset header inode */ daddr_t vsi_oltext; /* OLT extent */ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index 6dee109aeea4..78948b4b1894 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp) vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); if (!vip) { - printk(KERN_ERR "vxfs: unabled to read fsh inode\n"); + printk(KERN_ERR "vxfs: unable to read fsh inode\n"); return -EINVAL; } if (!VXFS_ISFSH(vip)) { @@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp) infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); if (!infp->vsi_fship) { - printk(KERN_ERR "vxfs: unabled to get fsh inode\n"); + printk(KERN_ERR "vxfs: unable to get fsh inode\n"); goto out_free_fship; } sfp = vxfs_getfsh(infp->vsi_fship, 0); if (!sfp) { - printk(KERN_ERR "vxfs: unabled to get structural fsh\n"); + printk(KERN_ERR "vxfs: unable to get structural fsh\n"); goto out_iput_fship; } @@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp) pfp = vxfs_getfsh(infp->vsi_fship, 1); if (!pfp) { - printk(KERN_ERR "vxfs: unabled to get primary fsh\n"); + printk(KERN_ERR "vxfs: unable to get primary fsh\n"); goto out_free_sfp; } @@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp) infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); if (!infp->vsi_stilist) { - printk(KERN_ERR "vxfs: unabled to get structual list inode\n"); + printk(KERN_ERR "vxfs: unable to get structural list inode\n"); kfree(tip); goto out_free_pfp; } @@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp) goto out_iput_stilist; infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); if (!infp->vsi_ilist) { - printk(KERN_ERR "vxfs: unabled to get inode list inode\n"); + printk(KERN_ERR "vxfs: unable to get inode list inode\n"); kfree(tip); goto out_iput_stilist; } diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index c3e1f760cac9..72437065f6ad 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o -fuse-objs := dev.o dir.o file.o inode.o +fuse-objs := dev.o dir.o file.o inode.o control.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c new file mode 100644 index 000000000000..a3bce3a77253 --- /dev/null +++ b/fs/fuse/control.c @@ -0,0 +1,218 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +#include "fuse_i.h" + +#include <linux/init.h> +#include <linux/module.h> + +#define FUSE_CTL_SUPER_MAGIC 0x65735543 + +/* + * This is non-NULL when the single instance of the control filesystem + * exists. Protected by fuse_mutex + */ +static struct super_block *fuse_control_sb; + +static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) +{ + struct fuse_conn *fc; + mutex_lock(&fuse_mutex); + fc = file->f_dentry->d_inode->u.generic_ip; + if (fc) + fc = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); + return fc; +} + +static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (fc) { + fuse_abort_conn(fc); + fuse_conn_put(fc); + } + return count; +} + +static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[32]; + size_t size; + + if (!*ppos) { + struct fuse_conn *fc = fuse_ctl_file_conn_get(file); + if (!fc) + return 0; + + file->private_data=(void *)(long)atomic_read(&fc->num_waiting); + fuse_conn_put(fc); + } + size = sprintf(tmp, "%ld\n", (long)file->private_data); + return simple_read_from_buffer(buf, len, ppos, tmp, size); +} + +static const struct file_operations fuse_ctl_abort_ops = { + .open = nonseekable_open, + .write = fuse_conn_abort_write, +}; + +static const struct file_operations fuse_ctl_waiting_ops = { + .open = nonseekable_open, + .read = fuse_conn_waiting_read, +}; + +static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, + struct fuse_conn *fc, + const char *name, + int mode, int nlink, + struct inode_operations *iop, + const struct file_operations *fop) +{ + struct dentry *dentry; + struct inode *inode; + + BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + inode = new_inode(fuse_control_sb); + if (!inode) + return NULL; + + inode->i_mode = mode; + inode->i_uid = fc->user_id; + inode->i_gid = fc->group_id; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* setting ->i_op to NULL is not allowed */ + if (iop) + inode->i_op = iop; + inode->i_fop = fop; + inode->i_nlink = nlink; + inode->u.generic_ip = fc; + d_add(dentry, inode); + return dentry; +} + +/* + * Add a connection to the control filesystem (if it exists). Caller + * must host fuse_mutex + */ +int fuse_ctl_add_conn(struct fuse_conn *fc) +{ + struct dentry *parent; + char name[32]; + + if (!fuse_control_sb) + return 0; + + parent = fuse_control_sb->s_root; + parent->d_inode->i_nlink++; + sprintf(name, "%llu", (unsigned long long) fc->id); + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + &simple_dir_inode_operations, + &simple_dir_operations); + if (!parent) + goto err; + + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + NULL, &fuse_ctl_waiting_ops) || + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + NULL, &fuse_ctl_abort_ops)) + goto err; + + return 0; + + err: + fuse_ctl_remove_conn(fc); + return -ENOMEM; +} + +/* + * Remove a connection from the control filesystem (if it exists). + * Caller must host fuse_mutex + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc) +{ + int i; + + if (!fuse_control_sb) + return; + + for (i = fc->ctl_ndents - 1; i >= 0; i--) { + struct dentry *dentry = fc->ctl_dentry[i]; + dentry->d_inode->u.generic_ip = NULL; + d_drop(dentry); + dput(dentry); + } + fuse_control_sb->s_root->d_inode->i_nlink--; +} + +static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct tree_descr empty_descr = {""}; + struct fuse_conn *fc; + int err; + + err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr); + if (err) + return err; + + mutex_lock(&fuse_mutex); + BUG_ON(fuse_control_sb); + fuse_control_sb = sb; + list_for_each_entry(fc, &fuse_conn_list, entry) { + err = fuse_ctl_add_conn(fc); + if (err) { + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + return err; + } + } + mutex_unlock(&fuse_mutex); + + return 0; +} + +static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data, + struct vfsmount *mnt) +{ + return get_sb_single(fs_type, flags, raw_data, + fuse_ctl_fill_super, mnt); +} + +static void fuse_ctl_kill_sb(struct super_block *sb) +{ + mutex_lock(&fuse_mutex); + fuse_control_sb = NULL; + mutex_unlock(&fuse_mutex); + + kill_litter_super(sb); +} + +static struct file_system_type fuse_ctl_fs_type = { + .owner = THIS_MODULE, + .name = "fusectl", + .get_sb = fuse_ctl_get_sb, + .kill_sb = fuse_ctl_kill_sb, +}; + +int __init fuse_ctl_init(void) +{ + return register_filesystem(&fuse_ctl_fs_type); +} + +void fuse_ctl_cleanup(void) +{ + unregister_filesystem(&fuse_ctl_fs_type); +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 104a62dadb94..1e2006caf158 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req) { memset(req, 0, sizeof(*req)); INIT_LIST_HEAD(&req->list); + INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); atomic_set(&req->count, 1); } @@ -64,18 +65,6 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } -/* - * Reset request, so that it can be reused - * - * The caller must be _very_ careful to make sure, that it is holding - * the only reference to req - */ -void fuse_reset_request(struct fuse_req *req) -{ - BUG_ON(atomic_read(&req->count) != 1); - fuse_request_init(req); -} - static void __fuse_get_request(struct fuse_req *req) { atomic_inc(&req->count); @@ -88,6 +77,13 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } +static void fuse_req_init_context(struct fuse_req *req) +{ + req->in.h.uid = current->fsuid; + req->in.h.gid = current->fsgid; + req->in.h.pid = current->pid; +} + struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; @@ -103,14 +99,16 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (intr) goto out; + err = -ENOTCONN; + if (!fc->connected) + goto out; + req = fuse_request_alloc(); err = -ENOMEM; if (!req) goto out; - req->in.h.uid = current->fsuid; - req->in.h.gid = current->fsgid; - req->in.h.pid = current->pid; + fuse_req_init_context(req); req->waiting = 1; return req; @@ -119,142 +117,183 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) return ERR_PTR(err); } -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +/* + * Return request in fuse_file->reserved_req. However that may + * currently be in use. If that is the case, wait for it to become + * available. + */ +static struct fuse_req *get_reserved_req(struct fuse_conn *fc, + struct file *file) { - if (atomic_dec_and_test(&req->count)) { - if (req->waiting) - atomic_dec(&fc->num_waiting); - fuse_request_free(req); - } + struct fuse_req *req = NULL; + struct fuse_file *ff = file->private_data; + + do { + wait_event(fc->blocked_waitq, ff->reserved_req); + spin_lock(&fc->lock); + if (ff->reserved_req) { + req = ff->reserved_req; + ff->reserved_req = NULL; + get_file(file); + req->stolen_file = file; + } + spin_unlock(&fc->lock); + } while (!req); + + return req; } /* - * Called with sbput_sem held for read (request_end) or write - * (fuse_put_super). By the time fuse_put_super() is finished, all - * inodes belonging to background requests must be released, so the - * iputs have to be done within the locked region. + * Put stolen request back into fuse_file->reserved_req */ -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) +static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) { - iput(req->inode); - iput(req->inode2); + struct file *file = req->stolen_file; + struct fuse_file *ff = file->private_data; + spin_lock(&fc->lock); - list_del(&req->bg_entry); - if (fc->num_background == FUSE_MAX_BACKGROUND) { - fc->blocked = 0; - wake_up_all(&fc->blocked_waitq); - } - fc->num_background--; + fuse_request_init(req); + BUG_ON(ff->reserved_req); + ff->reserved_req = req; + wake_up(&fc->blocked_waitq); spin_unlock(&fc->lock); + fput(file); } /* - * This function is called when a request is finished. Either a reply - * has arrived or it was interrupted (and not yet sent) or some error - * occurred during communication with userspace, or the device file - * was closed. In case of a background request the reference to the - * stored objects are released. The requester thread is woken up (if - * still waiting), the 'end' callback is called if given, else the - * reference to the request is released + * Gets a requests for a file operation, always succeeds * - * Releasing extra reference for foreground requests must be done - * within the same locked region as setting state to finished. This - * is because fuse_reset_request() may be called after request is - * finished and it must be the sole possessor. If request is - * interrupted and put in the background, it will return with an error - * and hence never be reset and reused. + * This is used for sending the FLUSH request, which must get to + * userspace, due to POSIX locks which may need to be unlocked. * - * Called with fc->lock, unlocks it + * If allocation fails due to OOM, use the reserved request in + * fuse_file. + * + * This is very unlikely to deadlock accidentally, since the + * filesystem should not have it's own file open. If deadlock is + * intentional, it can still be broken by "aborting" the filesystem. */ -static void request_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) { - list_del(&req->list); - req->state = FUSE_REQ_FINISHED; - if (!req->background) { - spin_unlock(&fc->lock); - wake_up(&req->waitq); - fuse_put_request(fc, req); - } else { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; - spin_unlock(&fc->lock); - down_read(&fc->sbput_sem); - if (fc->mounted) - fuse_release_background(fc, req); - up_read(&fc->sbput_sem); + struct fuse_req *req; - /* fput must go outside sbput_sem, otherwise it can deadlock */ - if (req->file) - fput(req->file); + atomic_inc(&fc->num_waiting); + wait_event(fc->blocked_waitq, !fc->blocked); + req = fuse_request_alloc(); + if (!req) + req = get_reserved_req(fc, file); - if (end) - end(fc, req); + fuse_req_init_context(req); + req->waiting = 1; + return req; +} + +void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + if (req->waiting) + atomic_dec(&fc->num_waiting); + + if (req->stolen_file) + put_reserved_req(fc, req); else - fuse_put_request(fc, req); + fuse_request_free(req); } } /* - * Unfortunately request interruption not just solves the deadlock - * problem, it causes problems too. These stem from the fact, that an - * interrupted request is continued to be processed in userspace, - * while all the locks and object references (inode and file) held - * during the operation are released. - * - * To release the locks is exactly why there's a need to interrupt the - * request, so there's not a lot that can be done about this, except - * introduce additional locking in userspace. - * - * More important is to keep inode and file references until userspace - * has replied, otherwise FORGET and RELEASE could be sent while the - * inode/file is still used by the filesystem. - * - * For this reason the concept of "background" request is introduced. - * An interrupted request is backgrounded if it has been already sent - * to userspace. Backgrounding involves getting an extra reference to - * inode(s) or file used in the request, and adding the request to - * fc->background list. When a reply is received for a background - * request, the object references are released, and the request is - * removed from the list. If the filesystem is unmounted while there - * are still background requests, the list is walked and references - * are released as if a reply was received. + * This function is called when a request is finished. Either a reply + * has arrived or it was aborted (and not yet sent) or some error + * occurred during communication with userspace, or the device file + * was closed. The requester thread is woken up (if still waiting), + * the 'end' callback is called if given, else the reference to the + * request is released * - * There's one more use for a background request. The RELEASE message is - * always sent as background, since it doesn't return an error or - * data. + * Called with fc->lock, unlocks it */ -static void background_request(struct fuse_conn *fc, struct fuse_req *req) -{ - req->background = 1; - list_add(&req->bg_entry, &fc->background); - fc->num_background++; - if (fc->num_background == FUSE_MAX_BACKGROUND) - fc->blocked = 1; - if (req->inode) - req->inode = igrab(req->inode); - if (req->inode2) - req->inode2 = igrab(req->inode2); +static void request_end(struct fuse_conn *fc, struct fuse_req *req) +{ + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + list_del(&req->list); + list_del(&req->intr_entry); + req->state = FUSE_REQ_FINISHED; + if (req->background) { + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; + } + spin_unlock(&fc->lock); + dput(req->dentry); + mntput(req->vfsmount); if (req->file) - get_file(req->file); + fput(req->file); + wake_up(&req->waitq); + if (end) + end(fc, req); + else + fuse_put_request(fc, req); } -/* Called with fc->lock held. Releases, and then reacquires it. */ -static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +static void wait_answer_interruptible(struct fuse_conn *fc, + struct fuse_req *req) { - sigset_t oldset; + if (signal_pending(current)) + return; spin_unlock(&fc->lock); - block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); - restore_sigs(&oldset); spin_lock(&fc->lock); - if (req->state == FUSE_REQ_FINISHED && !req->interrupted) - return; +} + +static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +{ + list_add_tail(&req->intr_entry, &fc->interrupts); + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); +} + +/* Called with fc->lock held. Releases, and then reacquires it. */ +static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +{ + if (!fc->no_interrupt) { + /* Any signal may interrupt this */ + wait_answer_interruptible(fc, req); + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; - if (!req->interrupted) { - req->out.h.error = -EINTR; req->interrupted = 1; + if (req->state == FUSE_REQ_SENT) + queue_interrupt(fc, req); + } + + if (req->force) { + spin_unlock(&fc->lock); + wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + } else { + sigset_t oldset; + + /* Only fatal signals may interrupt this */ + block_sigs(&oldset); + wait_answer_interruptible(fc, req); + restore_sigs(&oldset); } + + if (req->aborted) + goto aborted; + if (req->state == FUSE_REQ_FINISHED) + return; + + req->out.h.error = -EINTR; + req->aborted = 1; + + aborted: if (req->locked) { /* This is uninterruptible sleep, because data is being copied to/from the buffers of req. During @@ -268,8 +307,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); __fuse_put_request(req); - } else if (req->state == FUSE_REQ_SENT) - background_request(fc, req); + } else if (req->state == FUSE_REQ_SENT) { + spin_unlock(&fc->lock); + wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); + spin_lock(&fc->lock); + } } static unsigned len_args(unsigned numargs, struct fuse_arg *args) @@ -283,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } +static u64 fuse_get_unique(struct fuse_conn *fc) + { + fc->reqctr++; + /* zero is special */ + if (fc->reqctr == 0) + fc->reqctr = 1; + + return fc->reqctr; +} + static void queue_request(struct fuse_conn *fc, struct fuse_req *req) { - fc->reqctr++; - /* zero is special */ - if (fc->reqctr == 0) - fc->reqctr = 1; - req->in.h.unique = fc->reqctr; + req->in.h.unique = fuse_get_unique(fc); req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); @@ -302,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) kill_fasync(&fc->fasync, SIGIO, POLL_IN); } -/* - * This can only be interrupted by a SIGKILL - */ void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; @@ -327,8 +372,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); - background_request(fc, req); if (fc->connected) { + req->background = 1; + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; + queue_request(fc, req); spin_unlock(&fc->lock); } else { @@ -352,14 +401,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already - * interrupted bail out. + * aborted bail out. */ static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { spin_lock(&fc->lock); - if (req->interrupted) + if (req->aborted) err = -ENOENT; else req->locked = 1; @@ -369,7 +418,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req) } /* - * Unlock request. If it was interrupted during being locked, the + * Unlock request. If it was aborted during being locked, the * requester thread is currently waiting for it to be unlocked, so * wake it up. */ @@ -378,7 +427,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) if (req) { spin_lock(&fc->lock); req->locked = 0; - if (req->interrupted) + if (req->aborted) wake_up(&req->waitq); spin_unlock(&fc->lock); } @@ -557,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } +static int request_pending(struct fuse_conn *fc) +{ + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); +} + /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&fc->waitq, &wait); - while (fc->connected && list_empty(&fc->pending)) { + while (fc->connected && !request_pending(fc)) { set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) break; @@ -577,11 +631,50 @@ static void request_wait(struct fuse_conn *fc) } /* + * Transfer an interrupt request to userspace + * + * Unlike other requests this is assembled on demand, without a need + * to allocate a separate fuse_req structure. + * + * Called with fc->lock held, releases it + */ +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) +{ + struct fuse_copy_state cs; + struct fuse_in_header ih; + struct fuse_interrupt_in arg; + unsigned reqsize = sizeof(ih) + sizeof(arg); + int err; + + list_del_init(&req->intr_entry); + req->intr_unique = fuse_get_unique(fc); + memset(&ih, 0, sizeof(ih)); + memset(&arg, 0, sizeof(arg)); + ih.len = reqsize; + ih.opcode = FUSE_INTERRUPT; + ih.unique = req->intr_unique; + arg.unique = req->in.h.unique; + + spin_unlock(&fc->lock); + if (iov_length(iov, nr_segs) < reqsize) + return -EINVAL; + + fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); + err = fuse_copy_one(&cs, &ih, sizeof(ih)); + if (!err) + err = fuse_copy_one(&cs, &arg, sizeof(arg)); + fuse_copy_finish(&cs); + + return err ? err : reqsize; +} + +/* * Read a single request into the userspace filesystem's buffer. This * function waits until a request is available, then removes it from * the pending list and copies request data to userspace buffer. If - * no reply is needed (FORGET) or request has been interrupted or - * there was an error during the copying then it's finished by calling + * no reply is needed (FORGET) or request has been aborted or there + * was an error during the copying then it's finished by calling * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ @@ -601,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, spin_lock(&fc->lock); err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && - list_empty(&fc->pending)) + !request_pending(fc)) goto err_unlock; request_wait(fc); @@ -609,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, if (!fc->connected) goto err_unlock; err = -ERESTARTSYS; - if (list_empty(&fc->pending)) + if (!request_pending(fc)) goto err_unlock; + if (!list_empty(&fc->interrupts)) { + req = list_entry(fc->interrupts.next, struct fuse_req, + intr_entry); + return fuse_read_interrupt(fc, req, iov, nr_segs); + } + req = list_entry(fc->pending.next, struct fuse_req, list); req->state = FUSE_REQ_READING; list_move(&req->list, &fc->io); @@ -636,10 +735,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, fuse_copy_finish(&cs); spin_lock(&fc->lock); req->locked = 0; - if (!err && req->interrupted) + if (!err && req->aborted) err = -ENOENT; if (err) { - if (!req->interrupted) + if (!req->aborted) req->out.h.error = -EIO; request_end(fc, req); return err; @@ -649,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); + if (req->interrupted) + queue_interrupt(fc, req); spin_unlock(&fc->lock); } return reqsize; @@ -675,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) list_for_each(entry, &fc->processing) { struct fuse_req *req; req = list_entry(entry, struct fuse_req, list); - if (req->in.h.unique == unique) + if (req->in.h.unique == unique || req->intr_unique == unique) return req; } return NULL; @@ -741,17 +842,33 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; req = request_find(fc, oh.unique); - err = -EINVAL; if (!req) goto err_unlock; - if (req->interrupted) { + if (req->aborted) { spin_unlock(&fc->lock); fuse_copy_finish(&cs); spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } + /* Is it an interrupt reply? */ + if (req->intr_unique == oh.unique) { + err = -EINVAL; + if (nbytes != sizeof(struct fuse_out_header)) + goto err_unlock; + + if (oh.error == -ENOSYS) + fc->no_interrupt = 1; + else if (oh.error == -EAGAIN) + queue_interrupt(fc, req); + + spin_unlock(&fc->lock); + fuse_copy_finish(&cs); + return nbytes; + } + + req->state = FUSE_REQ_WRITING; list_move(&req->list, &fc->io); req->out.h = oh; req->locked = 1; @@ -764,9 +881,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, spin_lock(&fc->lock); req->locked = 0; if (!err) { - if (req->interrupted) + if (req->aborted) err = -ENOENT; - } else if (!req->interrupted) + } else if (!req->aborted) req->out.h.error = -EIO; request_end(fc, req); @@ -800,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) spin_lock(&fc->lock); if (!fc->connected) mask = POLLERR; - else if (!list_empty(&fc->pending)) + else if (request_pending(fc)) mask |= POLLIN | POLLRDNORM; spin_unlock(&fc->lock); @@ -826,7 +943,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) /* * Abort requests under I/O * - * The requests are set to interrupted and finished, and the request + * The requests are set to aborted and finished, and the request * waiter is woken up. This will make request_wait_answer() wait * until the request is unlocked and then return. * @@ -841,7 +958,7 @@ static void end_io_requests(struct fuse_conn *fc) list_entry(fc->io.next, struct fuse_req, list); void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->interrupted = 1; + req->aborted = 1; req->out.h.error = -ECONNABORTED; req->state = FUSE_REQ_FINISHED; list_del_init(&req->list); @@ -874,19 +991,20 @@ static void end_io_requests(struct fuse_conn *fc) * onto the pending list is prevented by req->connected being false. * * Progression of requests under I/O to the processing list is - * prevented by the req->interrupted flag being true for these - * requests. For this reason requests on the io list must be aborted - * first. + * prevented by the req->aborted flag being true for these requests. + * For this reason requests on the io list must be aborted first. */ void fuse_abort_conn(struct fuse_conn *fc) { spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; + fc->blocked = 0; end_io_requests(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } spin_unlock(&fc->lock); @@ -902,7 +1020,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file) end_requests(fc, &fc->processing); spin_unlock(&fc->lock); fasync_helper(-1, file, 0, &fc->fasync); - kobject_put(&fc->kobj); + fuse_conn_put(fc); } return 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8d7546e832e8..72a74cde6de8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, { req->in.h.opcode = FUSE_LOOKUP; req->in.h.nodeid = get_node_id(dir); - req->inode = dir; req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; @@ -225,6 +224,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, } /* + * Synchronous release for the case when something goes wrong in CREATE_OPEN + */ +static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, + u64 nodeid, int flags) +{ + struct fuse_req *req; + + req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); + req->force = 1; + request_send(fc, req); + fuse_put_request(fc, req); +} + +/* * Atomic create+open operation * * If the filesystem doesn't support this, then fall back to separate @@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; + struct fuse_req *forget_req; struct fuse_open_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; @@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (fc->no_create) return -ENOSYS; + forget_req = fuse_get_req(fc); + if (IS_ERR(forget_req)) + return PTR_ERR(forget_req); + req = fuse_get_req(fc); + err = PTR_ERR(req); if (IS_ERR(req)) - return PTR_ERR(req); + goto out_put_forget_req; err = -ENOMEM; ff = fuse_file_alloc(); @@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, inarg.mode = mode; req->in.h.opcode = FUSE_CREATE; req->in.h.nodeid = get_node_id(dir); - req->inode = dir; req->in.numargs = 2; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) goto out_free_ff; + fuse_put_request(fc, req); inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr); - err = -ENOMEM; if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); ff->fh = outopen.fh; - /* Special release, with inode = NULL, this will - trigger a 'forget' request when the release is - complete */ - fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0); - goto out_put_request; + fuse_sync_release(fc, ff, outentry.nodeid, flags); + fuse_send_forget(fc, forget_req, outentry.nodeid, 1); + return -ENOMEM; } - fuse_put_request(fc, req); + fuse_put_request(fc, forget_req); d_instantiate(entry, inode); fuse_change_timeout(entry, &outentry); file = lookup_instantiate_filp(nd, entry, generic_file_open); if (IS_ERR(file)) { ff->fh = outopen.fh; - fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0); + fuse_sync_release(fc, ff, outentry.nodeid, flags); return PTR_ERR(file); } fuse_finish_open(inode, file, ff, &outopen); @@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); + out_put_forget_req: + fuse_put_request(fc, forget_req); return err; } @@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, int err; req->in.h.nodeid = get_node_id(dir); - req->inode = dir; req->out.numargs = 1; req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; @@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); - req->inode = dir; req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; @@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); - req->inode = dir; req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; @@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, inarg.newdir = get_node_id(newdir); req->in.h.opcode = FUSE_RENAME; req->in.h.nodeid = get_node_id(olddir); - req->inode = olddir; - req->inode2 = newdir; req->in.numargs = 3; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); req->in.h.opcode = FUSE_LINK; - req->inode2 = inode; req->in.numargs = 2; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode) req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->out.numargs = 1; req->out.args[0].size = sizeof(arg); req->out.args[0].value = &arg; @@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask) inarg.mask = mask; req->in.h.opcode = FUSE_ACCESS; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry) } req->in.h.opcode = FUSE_READLINK; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->out.argvar = 1; req->out.numargs = 1; req->out.args[0].size = PAGE_SIZE - 1; @@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) iattr_to_fattr(attr, &inarg); req->in.h.opcode = FUSE_SETATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name, inarg.flags = flags; req->in.h.opcode = FUSE_SETXATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 3; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, inarg.size = size; req->in.h.opcode = FUSE_GETXATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 2; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) inarg.size = size; req->in.h.opcode = FUSE_LISTXATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name) req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = strlen(name) + 1; req->in.args[0].value = name; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 087f3b734f40..28aa81eae2cc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -49,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void) struct fuse_file *ff; ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); if (ff) { - ff->release_req = fuse_request_alloc(); - if (!ff->release_req) { + ff->reserved_req = fuse_request_alloc(); + if (!ff->reserved_req) { kfree(ff); ff = NULL; } @@ -60,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void) void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->release_req); + fuse_request_free(ff->reserved_req); kfree(ff); } @@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) return err; } -/* Special case for failed iget in CREATE */ -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, + int opcode) { - /* If called from end_io_requests(), req has more than one - reference and fuse_reset_request() cannot work */ - if (fc->connected) { - u64 nodeid = req->in.h.nodeid; - fuse_reset_request(req); - fuse_send_forget(fc, req, nodeid, 1); - } else - fuse_put_request(fc, req); -} - -void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, - u64 nodeid, struct inode *inode, int flags, int isdir) -{ - struct fuse_req * req = ff->release_req; + struct fuse_req *req = ff->reserved_req; struct fuse_release_in *inarg = &req->misc.release_in; inarg->fh = ff->fh; inarg->flags = flags; - req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; + req->in.h.opcode = opcode; req->in.h.nodeid = nodeid; - req->inode = inode; req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_release_in); req->in.args[0].value = inarg; - request_send_background(fc, req); - if (!inode) - req->end = fuse_release_end; kfree(ff); + + return req; } int fuse_release_common(struct inode *inode, struct file *file, int isdir) @@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) struct fuse_file *ff = file->private_data; if (ff) { struct fuse_conn *fc = get_fuse_conn(inode); - u64 nodeid = get_node_id(inode); - fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir); + struct fuse_req *req; + + req = fuse_release_fill(ff, get_node_id(inode), file->f_flags, + isdir ? FUSE_RELEASEDIR : FUSE_RELEASE); + + /* Hold vfsmount and dentry until release is finished */ + req->vfsmount = mntget(file->f_vfsmnt); + req->dentry = dget(file->f_dentry); + request_send_background(fc, req); } /* Return value is ignored by VFS */ @@ -169,6 +160,28 @@ static int fuse_release(struct inode *inode, struct file *file) return fuse_release_common(inode, file, 0); } +/* + * Scramble the ID space with XTEA, so that the value of the files_struct + * pointer is not exposed to userspace. + */ +static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) +{ + u32 *k = fc->scramble_key; + u64 v = (unsigned long) id; + u32 v0 = v; + u32 v1 = v >> 32; + u32 sum = 0; + int i; + + for (i = 0; i < 32; i++) { + v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); + sum += 0x9E3779B9; + v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); + } + + return (u64) v0 + ((u64) v1 << 32); +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file->f_dentry->d_inode; @@ -184,19 +197,16 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - req = fuse_get_req(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - + req = fuse_get_req_nofail(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fc, id); req->in.h.opcode = FUSE_FLUSH; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; - req->file = file; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; + req->force = 1; request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -232,8 +242,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, inarg.fsync_flags = datasync ? 1 : 0; req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; - req->file = file; req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; @@ -266,8 +274,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, inarg->size = count; req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; - req->file = file; req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_read_in); req->in.args[0].value = inarg; @@ -342,6 +348,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, req->out.page_zeroing = 1; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); if (fc->async_read) { + get_file(file); + req->file = file; req->end = fuse_readpages_end; request_send_background(fc, req); } else { @@ -420,8 +428,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, inarg.size = count; req->in.h.opcode = FUSE_WRITE; req->in.h.nodeid = get_node_id(inode); - req->inode = inode; - req->file = file; req->in.argpages = 1; req->in.numargs = 2; req->in.args[0].size = sizeof(struct fuse_write_in); @@ -619,6 +625,126 @@ static int fuse_set_page_dirty(struct page *page) return 0; } +static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, + struct file_lock *fl) +{ + switch (ffl->type) { + case F_UNLCK: + break; + + case F_RDLCK: + case F_WRLCK: + if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || + ffl->end < ffl->start) + return -EIO; + + fl->fl_start = ffl->start; + fl->fl_end = ffl->end; + fl->fl_pid = ffl->pid; + break; + + default: + return -EIO; + } + fl->fl_type = ffl->type; + return 0; +} + +static void fuse_lk_fill(struct fuse_req *req, struct file *file, + const struct file_lock *fl, int opcode, pid_t pid) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + struct fuse_lk_in *arg = &req->misc.lk_in; + + arg->fh = ff->fh; + arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + arg->lk.start = fl->fl_start; + arg->lk.end = fl->fl_end; + arg->lk.type = fl->fl_type; + arg->lk.pid = pid; + req->in.h.opcode = opcode; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*arg); + req->in.args[0].value = arg; +} + +static int fuse_getlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_lk_out outarg; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, FUSE_GETLK, 0); + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) + err = convert_fuse_file_lock(&outarg.lk, fl); + + return err; +} + +static int fuse_setlk(struct file *file, struct file_lock *fl) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + int err; + + /* Unlock on close is handled by the flush method */ + if (fl->fl_flags & FL_CLOSE) + return 0; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + fuse_lk_fill(req, file, fl, opcode, pid); + request_send(fc, req); + err = req->out.h.error; + /* locking is restartable */ + if (err == -EINTR) + err = -ERESTARTSYS; + fuse_put_request(fc, req); + return err; +} + +static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + if (cmd == F_GETLK) { + if (fc->no_lock) { + if (!posix_test_lock(file, fl, fl)) + fl->fl_type = F_UNLCK; + err = 0; + } else + err = fuse_getlk(file, fl); + } else { + if (fc->no_lock) + err = posix_lock_file_wait(file, fl); + else + err = fuse_setlk(file, fl); + } + return err; +} + static const struct file_operations fuse_file_operations = { .llseek = generic_file_llseek, .read = generic_file_read, @@ -628,6 +754,7 @@ static const struct file_operations fuse_file_operations = { .flush = fuse_flush, .release = fuse_release, .fsync = fuse_fsync, + .lock = fuse_file_lock, .sendfile = generic_file_sendfile, }; @@ -639,6 +766,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .flush = fuse_flush, .release = fuse_release, .fsync = fuse_fsync, + .lock = fuse_file_lock, /* no mmap and sendfile */ }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0474202cb5dc..0dbf96621841 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -8,12 +8,13 @@ #include <linux/fuse.h> #include <linux/fs.h> +#include <linux/mount.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/backing-dev.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -24,6 +25,9 @@ /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 +/** Number of dentries for each connection in the control filesystem */ +#define FUSE_CTL_NUM_DENTRIES 3 + /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem module will check permissions based on the file mode. Otherwise no permission checking is done in the kernel */ @@ -33,6 +37,11 @@ doing the mount will be allowed to access the filesystem */ #define FUSE_ALLOW_OTHER (1 << 1) +/** List of active connections */ +extern struct list_head fuse_conn_list; + +/** Global mutex protecting fuse_conn_list and the control filesystem */ +extern struct mutex fuse_mutex; /** FUSE inode */ struct fuse_inode { @@ -56,7 +65,7 @@ struct fuse_inode { /** FUSE specific file data */ struct fuse_file { /** Request reserved for flush and release */ - struct fuse_req *release_req; + struct fuse_req *reserved_req; /** File handle used by userspace */ u64 fh; @@ -122,6 +131,7 @@ enum fuse_req_state { FUSE_REQ_PENDING, FUSE_REQ_READING, FUSE_REQ_SENT, + FUSE_REQ_WRITING, FUSE_REQ_FINISHED }; @@ -135,12 +145,15 @@ struct fuse_req { fuse_conn */ struct list_head list; - /** Entry on the background list */ - struct list_head bg_entry; + /** Entry on the interrupts list */ + struct list_head intr_entry; /** refcount */ atomic_t count; + /** Unique ID for the interrupt request */ + u64 intr_unique; + /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by @@ -150,12 +163,18 @@ struct fuse_req { /** True if the request has reply */ unsigned isreply:1; - /** The request was interrupted */ - unsigned interrupted:1; + /** Force sending of the request even if interrupted */ + unsigned force:1; + + /** The request was aborted */ + unsigned aborted:1; /** Request is sent in the background */ unsigned background:1; + /** The request has been interrupted */ + unsigned interrupted:1; + /** Data is being copied to/from the request */ unsigned locked:1; @@ -181,6 +200,7 @@ struct fuse_req { struct fuse_init_in init_in; struct fuse_init_out init_out; struct fuse_read_in read_in; + struct fuse_lk_in lk_in; } misc; /** page vector */ @@ -192,17 +212,20 @@ struct fuse_req { /** offset of data on first page */ unsigned page_offset; - /** Inode used in the request */ - struct inode *inode; - - /** Second inode used in the request (or NULL) */ - struct inode *inode2; - /** File used in the request (or NULL) */ struct file *file; + /** vfsmount used in release */ + struct vfsmount *vfsmount; + + /** dentry used in release */ + struct dentry *dentry; + /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); + + /** Request is stolen from fuse_file->reserved_req */ + struct file *stolen_file; }; /** @@ -216,6 +239,9 @@ struct fuse_conn { /** Lock protecting accessess to members of this structure */ spinlock_t lock; + /** Refcount */ + atomic_t count; + /** The user id for this mount */ uid_t user_id; @@ -243,13 +269,12 @@ struct fuse_conn { /** The list of requests under I/O */ struct list_head io; - /** Requests put in the background (RELEASE or any other - interrupted request) */ - struct list_head background; - /** Number of requests currently in the background */ unsigned num_background; + /** Pending interrupts */ + struct list_head interrupts; + /** Flag indicating if connection is blocked. This will be the case before the INIT reply is received, and if there are too many outstading backgrounds requests */ @@ -258,15 +283,9 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** RW semaphore for exclusion with fuse_put_super() */ - struct rw_semaphore sbput_sem; - /** The next unique request id */ u64 reqctr; - /** Mount is active */ - unsigned mounted; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -305,12 +324,18 @@ struct fuse_conn { /** Is removexattr not implemented by fs? */ unsigned no_removexattr : 1; + /** Are file locking primitives not implemented by fs? */ + unsigned no_lock : 1; + /** Is access not implemented by fs? */ unsigned no_access : 1; /** Is create not implemented by fs? */ unsigned no_create : 1; + /** Is interrupt not implemented by fs? */ + unsigned no_interrupt : 1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -320,11 +345,23 @@ struct fuse_conn { /** Backing dev info */ struct backing_dev_info bdi; - /** kobject */ - struct kobject kobj; + /** Entry on the fuse_conn_list */ + struct list_head entry; + + /** Unique ID */ + u64 id; + + /** Dentries in the control filesystem */ + struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; + + /** number of dentries used in the above array */ + int ctl_ndents; /** O_ASYNC requests */ struct fasync_struct *fasync; + + /** Key for lock owner ID scrambling */ + u32 scramble_key[4]; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -337,11 +374,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode) return get_fuse_conn_super(inode->i_sb); } -static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj) -{ - return container_of(obj, struct fuse_conn, kobj); -} - static inline struct fuse_inode *get_fuse_inode(struct inode *inode) { return container_of(inode, struct fuse_inode, inode); @@ -383,12 +415,9 @@ void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file, struct fuse_file *ff, struct fuse_open_out *outarg); -/** - * Send a RELEASE request - */ -void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, - u64 nodeid, struct inode *inode, int flags, int isdir); - +/** */ +struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, + int opcode); /** * Send RELEASE or RELEASEDIR request */ @@ -435,6 +464,9 @@ int fuse_dev_init(void); */ void fuse_dev_cleanup(void); +int fuse_ctl_init(void); +void fuse_ctl_cleanup(void); + /** * Allocate a request */ @@ -446,14 +478,14 @@ struct fuse_req *fuse_request_alloc(void); void fuse_request_free(struct fuse_req *req); /** - * Reinitialize a request, the preallocated flag is left unmodified + * Get a request, may fail with -ENOMEM */ -void fuse_reset_request(struct fuse_req *req); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Reserve a preallocated request + * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); /** * Decrement reference count of a request. If count goes to zero free @@ -476,11 +508,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); */ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); -/** - * Release inodes and file associated with background request - */ -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); - /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -493,3 +520,23 @@ int fuse_do_getattr(struct inode *inode); * Invalidate inode attributes */ void fuse_invalidate_attr(struct inode *inode); + +/** + * Acquire reference to fuse_conn + */ +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); + +/** + * Release reference to fuse_conn + */ +void fuse_conn_put(struct fuse_conn *fc); + +/** + * Add connection to control filesystem + */ +int fuse_ctl_add_conn(struct fuse_conn *fc); + +/** + * Remove connection from control filesystem + */ +void fuse_ctl_remove_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a13c0f529058..dcaaabd3b9c4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -11,25 +11,20 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/parser.h> #include <linux/statfs.h> +#include <linux/random.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static kmem_cache_t *fuse_inode_cachep; -static struct subsystem connections_subsys; - -struct fuse_conn_attr { - struct attribute attr; - ssize_t (*show)(struct fuse_conn *, char *); - ssize_t (*store)(struct fuse_conn *, const char *, size_t); -}; +struct list_head fuse_conn_list; +DEFINE_MUTEX(fuse_mutex); #define FUSE_SUPER_MAGIC 0x65735546 @@ -104,6 +99,14 @@ static void fuse_clear_inode(struct inode *inode) } } +static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +{ + if (*flags & MS_MANDLOCK) + return -EINVAL; + + return 0; +} + void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) { if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) @@ -195,31 +198,29 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, return inode; } -static void fuse_umount_begin(struct super_block *sb) +static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags) { - fuse_abort_conn(get_fuse_conn_super(sb)); + if (flags & MNT_FORCE) + fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb)); } static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - down_write(&fc->sbput_sem); - while (!list_empty(&fc->background)) - fuse_release_background(fc, - list_entry(fc->background.next, - struct fuse_req, bg_entry)); - spin_lock(&fc->lock); - fc->mounted = 0; fc->connected = 0; + fc->blocked = 0; spin_unlock(&fc->lock); - up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); - kobject_del(&fc->kobj); - kobject_put(&fc->kobj); + wake_up_all(&fc->blocked_waitq); + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); + fuse_conn_put(fc); } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -369,11 +370,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static void fuse_conn_release(struct kobject *kobj) -{ - kfree(get_fuse_conn_kobj(kobj)); -} - static struct fuse_conn *new_conn(void) { struct fuse_conn *fc; @@ -381,24 +377,35 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { spin_lock_init(&fc->lock); + atomic_set(&fc->count, 1); init_waitqueue_head(&fc->waitq); init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->background); - init_rwsem(&fc->sbput_sem); - kobj_set_kset_s(fc, connections_subsys); - kobject_init(&fc->kobj); + INIT_LIST_HEAD(&fc->interrupts); atomic_set(&fc->num_waiting, 0); fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; fc->blocked = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } return fc; } +void fuse_conn_put(struct fuse_conn *fc) +{ + if (atomic_dec_and_test(&fc->count)) + kfree(fc); +} + +struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) +{ + atomic_inc(&fc->count); + return fc; +} + static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -414,6 +421,7 @@ static struct super_operations fuse_super_operations = { .destroy_inode = fuse_destroy_inode, .read_inode = fuse_read_inode, .clear_inode = fuse_clear_inode, + .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, @@ -433,8 +441,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; if (arg->flags & FUSE_ASYNC_READ) fc->async_read = 1; - } else + if (!(arg->flags & FUSE_POSIX_LOCKS)) + fc->no_lock = 1; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; + fc->no_lock = 1; + } fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; @@ -452,7 +464,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; - arg->flags |= FUSE_ASYNC_READ; + arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -468,10 +480,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) request_send_background(fc, req); } -static unsigned long long conn_id(void) +static u64 conn_id(void) { - /* BKL is held for ->get_sb() */ - static unsigned long long ctr = 1; + static u64 ctr = 1; return ctr++; } @@ -485,6 +496,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_req *init_req; int err; + if (sb->s_flags & MS_MANDLOCK) + return -EINVAL; + if (!parse_fuse_opt((char *) data, &d)) return -EINVAL; @@ -528,25 +542,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!init_req) goto err_put_root; - err = kobject_set_name(&fc->kobj, "%llu", conn_id()); - if (err) - goto err_free_req; - - err = kobject_add(&fc->kobj); - if (err) - goto err_free_req; - - /* Setting file->private_data can't race with other mount() - instances, since BKL is held for ->get_sb() */ + mutex_lock(&fuse_mutex); err = -EINVAL; if (file->private_data) - goto err_kobject_del; + goto err_unlock; + fc->id = conn_id(); + err = fuse_ctl_add_conn(fc); + if (err) + goto err_unlock; + + list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - fc->mounted = 1; fc->connected = 1; - kobject_get(&fc->kobj); - file->private_data = fc; + file->private_data = fuse_conn_get(fc); + mutex_unlock(&fuse_mutex); /* * atomic_dec_and_test() in fput() provides the necessary * memory barrier for file->private_data to be visible on all @@ -558,15 +568,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return 0; - err_kobject_del: - kobject_del(&fc->kobj); - err_free_req: + err_unlock: + mutex_unlock(&fuse_mutex); fuse_request_free(init_req); err_put_root: dput(root_dentry); err: fput(file); - kobject_put(&fc->kobj); + fuse_conn_put(fc); return err; } @@ -584,68 +593,8 @@ static struct file_system_type fuse_fs_type = { .kill_sb = kill_anon_super, }; -static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page) -{ - return sprintf(page, "%i\n", atomic_read(&fc->num_waiting)); -} - -static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page, - size_t count) -{ - fuse_abort_conn(fc); - return count; -} - -static struct fuse_conn_attr fuse_conn_waiting = - __ATTR(waiting, 0400, fuse_conn_waiting_show, NULL); -static struct fuse_conn_attr fuse_conn_abort = - __ATTR(abort, 0600, NULL, fuse_conn_abort_store); - -static struct attribute *fuse_conn_attrs[] = { - &fuse_conn_waiting.attr, - &fuse_conn_abort.attr, - NULL, -}; - -static ssize_t fuse_conn_attr_show(struct kobject *kobj, - struct attribute *attr, - char *page) -{ - struct fuse_conn_attr *fca = - container_of(attr, struct fuse_conn_attr, attr); - - if (fca->show) - return fca->show(get_fuse_conn_kobj(kobj), page); - else - return -EACCES; -} - -static ssize_t fuse_conn_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *page, size_t count) -{ - struct fuse_conn_attr *fca = - container_of(attr, struct fuse_conn_attr, attr); - - if (fca->store) - return fca->store(get_fuse_conn_kobj(kobj), page, count); - else - return -EACCES; -} - -static struct sysfs_ops fuse_conn_sysfs_ops = { - .show = &fuse_conn_attr_show, - .store = &fuse_conn_attr_store, -}; - -static struct kobj_type ktype_fuse_conn = { - .release = fuse_conn_release, - .sysfs_ops = &fuse_conn_sysfs_ops, - .default_attrs = fuse_conn_attrs, -}; - static decl_subsys(fuse, NULL, NULL); -static decl_subsys(connections, &ktype_fuse_conn, NULL); +static decl_subsys(connections, NULL, NULL); static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) @@ -719,6 +668,7 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); + INIT_LIST_HEAD(&fuse_conn_list); res = fuse_fs_init(); if (res) goto err; @@ -731,8 +681,14 @@ static int __init fuse_init(void) if (res) goto err_dev_cleanup; + res = fuse_ctl_init(); + if (res) + goto err_sysfs_cleanup; + return 0; + err_sysfs_cleanup: + fuse_sysfs_cleanup(); err_dev_cleanup: fuse_dev_cleanup(); err_fs_cleanup: @@ -745,6 +701,7 @@ static void __exit fuse_exit(void) { printk(KERN_DEBUG "fuse exit\n"); + fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); fuse_dev_cleanup(); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 7f96b5cb6781..8c9b28dff119 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -34,6 +34,7 @@ #include <linux/suspend.h> #include <linux/pagemap.h> #include <linux/kthread.h> +#include <linux/poison.h> #include <linux/proc_fs.h> #include <asm/uaccess.h> @@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh) { #ifdef CONFIG_JBD_DEBUG atomic_dec(&nr_journal_heads); - memset(jh, 0x5b, sizeof(*jh)); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); #endif kmem_cache_free(journal_head_cache, jh); } diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 80d7f53fd0a7..de5bafb4e853 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal, default: jbd_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); + brelse(bh); goto done; } } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 1862e8bc101d..b8886f048eaa 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add_tail(&jeb->list, &c->erase_complete_list); + list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); @@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. */ spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; - list_del(&jeb->list); - list_add(&jeb->list, &c->bad_list); + list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 8bedfd2ff689..ac0c350ed7d7 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) struct jffs2_eraseblock *ejeb; ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); - list_del(&ejeb->list); - list_add_tail(&ejeb->list, &c->erase_pending_list); + list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 0b02fc79e4d1..be1acc3dad97 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) return -ENOMEM; } - dbg_summary("returned succesfully\n"); + dbg_summary("returned successfully\n"); return 0; } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a7f153f79ecb..b9b700730dfe 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list */ if (first_raw == jeb->first_node) { D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 5549378358bf..4d52593a5fc6 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) /* allocate the disk blocks for the extent. initially, extBalloc() * will try to allocate disk blocks for the requested size (xlen). - * if this fails (xlen contigious free blocks not avaliable), it'll + * if this fails (xlen contiguous free blocks not avaliable), it'll * try to allocate a smaller number of blocks (producing a smaller * extent), with this smaller number of blocks consisting of the * requested number of blocks rounded down to the next smaller @@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp) * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks - * contigious free blocks not avaliable), we'll try to allocate + * contiguous free blocks not avaliable), we'll try to allocate * a smaller number of blocks (producing a smaller extent), with * this smaller number of blocks consisting of the requested * number of blocks rounded down to the next smaller power of 2 @@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* get the number of blocks to initially attempt to allocate. * we'll first try the number of blocks requested unless this - * number is greater than the maximum number of contigious free + * number is greater than the maximum number of contiguous free * blocks in the map. in that case, we'll start off with the * maximum free. */ @@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * in place. if this fails, we'll try to move the extent * to a new set of blocks. if moving the extent, we initially * will try to allocate disk blocks for the requested size - * (nnew). if this fails (nnew contigious free blocks not + * (nnew). if this fails (new contiguous free blocks not * avaliable), we'll try to allocate a smaller number of * blocks (producing a smaller extent), with this smaller * number of blocks consisting of the requested number of diff --git a/fs/libfs.c b/fs/libfs.c index 1b1156381787..ac02ea602c3d 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) /* fallthrough */ default: spin_lock(&dcache_lock); - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &dentry->d_subdirs); - } + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); @@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; spin_lock(&dcache_lock); /* next is still alive */ - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } @@ -424,13 +422,13 @@ out: static DEFINE_SPINLOCK(pin_fs_lock); -int simple_pin_fs(char *name, struct vfsmount **mount, int *count) +int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - mnt = do_kern_mount(name, 0, name, NULL); + mnt = vfs_kern_mount(type, 0, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index bce744468708..52774feab93f 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number, * that we mark locks for reclaiming, and that we bump the pseudo NSM state. */ -static inline -void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate) +static void nlmclnt_prepare_reclaim(struct nlm_host *host) { + down_write(&host->h_rwsem); host->h_monitored = 0; - host->h_nsmstate = newstate; host->h_state++; host->h_nextrebind = 0; nlm_rebind_host(host); @@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate) dprintk("NLM: reclaiming locks for host %s", host->h_name); } +static void nlmclnt_finish_reclaim(struct nlm_host *host) +{ + host->h_reclaiming = 0; + up_write(&host->h_rwsem); + dprintk("NLM: done reclaiming locks for host %s", host->h_name); +} + /* * Reclaim all locks on server host. We do this by spawning a separate * reclaimer thread. @@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate) void nlmclnt_recovery(struct nlm_host *host, u32 newstate) { - if (host->h_reclaiming++) { - if (host->h_nsmstate == newstate) - return; - nlmclnt_prepare_reclaim(host, newstate); - } else { - nlmclnt_prepare_reclaim(host, newstate); + if (host->h_nsmstate == newstate) + return; + host->h_nsmstate = newstate; + if (!host->h_reclaiming++) { nlm_get_host(host); __module_get(THIS_MODULE); if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0) @@ -190,6 +194,7 @@ reclaimer(void *ptr) struct nlm_host *host = (struct nlm_host *) ptr; struct nlm_wait *block; struct file_lock *fl, *next; + u32 nsmstate; daemonize("%s-reclaim", host->h_name); allow_signal(SIGKILL); @@ -199,19 +204,25 @@ reclaimer(void *ptr) lock_kernel(); lockd_up(); + nlmclnt_prepare_reclaim(host); /* First, reclaim all locks that have been marked. */ restart: + nsmstate = host->h_nsmstate; list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { list_del_init(&fl->fl_u.nfs_fl.list); if (signalled()) continue; - if (nlmclnt_reclaim(host, fl) == 0) - list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); - goto restart; + if (nlmclnt_reclaim(host, fl) != 0) + continue; + list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); + if (host->h_nsmstate != nsmstate) { + /* Argh! The server rebooted again! */ + list_splice_init(&host->h_granted, &host->h_reclaim); + goto restart; + } } - - host->h_reclaiming = 0; + nlmclnt_finish_reclaim(host); /* Now, wake up all processes that sleep on a blocked lock */ list_for_each_entry(block, &nlm_blocked, b_list) { diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index f96e38155b5c..4db62098d3f4 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) } block = nlmclnt_prepare_block(host, fl); +again: for(;;) { + /* Reboot protection */ + fl->fl_u.nfs_fl.state = host->h_state; status = nlmclnt_call(req, NLMPROC_LOCK); if (status < 0) goto out_unblock; @@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) } if (resp->status == NLM_LCK_GRANTED) { - fl->fl_u.nfs_fl.state = host->h_state; + down_read(&host->h_rwsem); + /* Check whether or not the server has rebooted */ + if (fl->fl_u.nfs_fl.state != host->h_state) { + up_read(&host->h_rwsem); + goto again; + } fl->fl_flags |= FL_SLEEP; /* Ensure the resulting lock will get added to granted list */ do_vfs_lock(fl); + up_read(&host->h_rwsem); } status = nlm_stat_to_errno(resp->status); out_unblock: @@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) static int nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) { + struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; int status; @@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either * case, we want to unlock. */ + down_read(&host->h_rwsem); do_vfs_lock(fl); + up_read(&host->h_rwsem); if (req->a_flags & RPC_TASK_ASYNC) return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 729ac427d359..38b0e8a1aec0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -112,11 +112,12 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, host->h_version = version; host->h_proto = proto; host->h_rpcclnt = NULL; - init_MUTEX(&host->h_sema); + mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; host->h_expires = jiffies + NLM_HOST_EXPIRE; atomic_set(&host->h_count, 1); init_waitqueue_head(&host->h_gracewait); + init_rwsem(&host->h_rwsem); host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_server = server; @@ -172,7 +173,7 @@ nlm_bind_host(struct nlm_host *host) (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); /* Lock host handle */ - down(&host->h_sema); + mutex_lock(&host->h_mutex); /* If we've already created an RPC client, check whether * RPC rebind is required @@ -204,12 +205,12 @@ nlm_bind_host(struct nlm_host *host) host->h_rpcclnt = clnt; } - up(&host->h_sema); + mutex_unlock(&host->h_mutex); return clnt; forgetit: printk("lockd: couldn't create RPC handle for %s\n", host->h_name); - up(&host->h_sema); + mutex_unlock(&host->h_mutex); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index bb4a3e40e432..c784e8bb57a3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, int error; char * to; - if (flags != 0) + if ((flags & ~AT_SYMLINK_FOLLOW) != 0) return -EINVAL; to = getname(newname); if (IS_ERR(to)) return PTR_ERR(to); - error = __user_walk_fd(olddfd, oldname, 0, &old_nd); + error = __user_walk_fd(olddfd, oldname, + flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, + &old_nd); if (error) goto exit; error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); diff --git a/fs/namespace.c b/fs/namespace.c index c13072a5f1ee..b3ed212ea416 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; - for (p = mnt; p; p = next_mnt(p, mnt)) { - list_del(&p->mnt_hash); - list_add(&p->mnt_hash, kill); - } + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, kill); if (propagate) propagate_umount(kill); @@ -585,8 +583,8 @@ static int do_umount(struct vfsmount *mnt, int flags) */ lock_kernel(); - if ((flags & MNT_FORCE) && sb->s_op->umount_begin) - sb->s_op->umount_begin(sb); + if (sb->s_op->umount_begin) + sb->s_op->umount_begin(mnt, flags); unlock_kernel(); /* @@ -1172,13 +1170,46 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts, } /* + * go through the vfsmounts we've just consigned to the graveyard to + * - check that they're still dead + * - delete the vfsmount from the appropriate namespace under lock + * - dispose of the corpse + */ +static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts) +{ + struct namespace *namespace; + struct vfsmount *mnt; + + while (!list_empty(graveyard)) { + LIST_HEAD(umounts); + mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire); + list_del_init(&mnt->mnt_expire); + + /* don't do anything if the namespace is dead - all the + * vfsmounts from it are going away anyway */ + namespace = mnt->mnt_namespace; + if (!namespace || !namespace->root) + continue; + get_namespace(namespace); + + spin_unlock(&vfsmount_lock); + down_write(&namespace_sem); + expire_mount(mnt, mounts, &umounts); + up_write(&namespace_sem); + release_mounts(&umounts); + mntput(mnt); + put_namespace(namespace); + spin_lock(&vfsmount_lock); + } +} + +/* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { - struct namespace *namespace; struct vfsmount *mnt, *next; LIST_HEAD(graveyard); @@ -1202,38 +1233,79 @@ void mark_mounts_for_expiry(struct list_head *mounts) list_move(&mnt->mnt_expire, &graveyard); } - /* - * go through the vfsmounts we've just consigned to the graveyard to - * - check that they're still dead - * - delete the vfsmount from the appropriate namespace under lock - * - dispose of the corpse - */ - while (!list_empty(&graveyard)) { - LIST_HEAD(umounts); - mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire); - list_del_init(&mnt->mnt_expire); + expire_mount_list(&graveyard, mounts); - /* don't do anything if the namespace is dead - all the - * vfsmounts from it are going away anyway */ - namespace = mnt->mnt_namespace; - if (!namespace || !namespace->root) + spin_unlock(&vfsmount_lock); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Ripoff of 'select_parent()' + * + * search the list of submounts for a given mountpoint, and move any + * shrinkable submounts to the 'graveyard' list. + */ +static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +{ + struct vfsmount *this_parent = parent; + struct list_head *next; + int found = 0; + +repeat: + next = this_parent->mnt_mounts.next; +resume: + while (next != &this_parent->mnt_mounts) { + struct list_head *tmp = next; + struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + + next = tmp->next; + if (!(mnt->mnt_flags & MNT_SHRINKABLE)) continue; - get_namespace(namespace); + /* + * Descend a level if the d_mounts list is non-empty. + */ + if (!list_empty(&mnt->mnt_mounts)) { + this_parent = mnt; + goto repeat; + } - spin_unlock(&vfsmount_lock); - down_write(&namespace_sem); - expire_mount(mnt, mounts, &umounts); - up_write(&namespace_sem); - release_mounts(&umounts); - mntput(mnt); - put_namespace(namespace); - spin_lock(&vfsmount_lock); + if (!propagate_mount_busy(mnt, 1)) { + mntget(mnt); + list_move_tail(&mnt->mnt_expire, graveyard); + found++; + } + } + /* + * All done at this level ... ascend and resume the search + */ + if (this_parent != parent) { + next = this_parent->mnt_child.next; + this_parent = this_parent->mnt_parent; + goto resume; } + return found; +} + +/* + * process a list of expirable mountpoints with the intent of discarding any + * submounts of a specific parent mountpoint + */ +void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts) +{ + LIST_HEAD(graveyard); + int found; + + spin_lock(&vfsmount_lock); + + /* extract submounts of 'mountpoint' from the expiration list */ + while ((found = select_submounts(mountpoint, &graveyard)) != 0) + expire_mount_list(&graveyard, mounts); spin_unlock(&vfsmount_lock); } -EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); +EXPORT_SYMBOL_GPL(shrink_submounts); /* * Some copy_from_user() implementations do not return the exact number of diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index ec61fd56a1a9..0b572a0c1967 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,14 +4,16 @@ obj-$(CONFIG_NFS_FS) += nfs.o -nfs-y := dir.o file.o inode.o nfs2xdr.o pagelist.o \ - proc.o read.o symlink.o unlink.o write.o +nfs-y := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \ + proc.o read.o symlink.o unlink.o write.o \ + namespace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ - callback.o callback_xdr.o callback_proc.o + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o nfs-$(CONFIG_NFS_DIRECTIO) += direct.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-objs := $(nfs-y) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 90c95adc8c1b..d53f8c6a9ecb 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -182,8 +182,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) /* * Define NFS4 callback program */ -extern struct svc_version nfs4_callback_version1; - static struct svc_version *nfs4_callback_version[] = { [1] = &nfs4_callback_version1, }; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 05c38cf40b69..c92991328d9a 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -202,7 +202,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd status = decode_fh(xdr, &args->fh); out: dprintk("%s: exit with status = %d\n", __FUNCTION__, status); - return 0; + return status; } static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cae74dd4c7f5..3ddda6f7ecc2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) lock_kernel(); - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (res < 0) { unlock_kernel(); return res; @@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) return (nd->intent.open.flags & O_EXCL) != 0; } +static inline int nfs_reval_fsid(struct inode *dir, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + struct nfs_server *server = NFS_SERVER(dir); + + if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) + /* Revalidate fsid on root dir */ + return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode); + return 0; +} + static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct dentry *res; @@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(error); goto out_unlock; } + error = nfs_reval_fsid(dir, &fhandle, &fattr); + if (error < 0) { + res = ERR_PTR(error); + goto out_unlock; + } inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); res = (struct dentry *)inode; if (IS_ERR(res)) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 3c72b0c07283..8ca9707be6c9 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -892,7 +892,7 @@ out: * nfs_init_directcache - create a slab cache for nfs_direct_req structures * */ -int nfs_init_directcache(void) +int __init nfs_init_directcache(void) { nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", sizeof(struct nfs_direct_req), @@ -906,7 +906,7 @@ int nfs_init_directcache(void) } /** - * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures + * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures * */ void nfs_destroy_directcache(void) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index fa05c027ea11..add289138836 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -127,23 +127,6 @@ nfs_file_release(struct inode *inode, struct file *filp) } /** - * nfs_revalidate_file - Revalidate the page cache & related metadata - * @inode - pointer to inode struct - * @file - pointer to file - */ -static int nfs_revalidate_file(struct inode *inode, struct file *filp) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int retval = 0; - - if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)) - || nfs_attribute_timeout(inode)) - retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - nfs_revalidate_mapping(inode, filp->f_mapping); - return 0; -} - -/** * nfs_revalidate_size - Revalidate the file size * @inode - pointer to inode struct * @file - pointer to struct file @@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos) dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); - result = nfs_revalidate_file(inode, iocb->ki_filp); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); if (!result) result = generic_file_aio_read(iocb, buf, count, pos); @@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_file(inode, filp); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) res = generic_file_sendfile(filp, ppos, count, actor, target); return res; @@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dfprintk(VFS, "nfs: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - status = nfs_revalidate_file(inode, file); + status = nfs_revalidate_mapping(inode, file->f_mapping); if (!status) status = generic_file_mmap(file, vma); return status; @@ -320,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse static void nfs_invalidate_page(struct page *page, unsigned long offset) { - /* FIXME: we really should cancel any unstarted writes on this page */ + struct inode *inode = page->mapping->host; + + /* Cancel any unstarted writes on this page */ + if (offset == 0) + nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE); } static int nfs_release_page(struct page *page, gfp_t gfp) @@ -373,7 +360,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t if (result) goto out; } - nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); result = count; if (!count) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 3fab5b0cfc5a..b81e7ed3c902 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -47,7 +47,6 @@ #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> -#include <linux/nfs_fs_sb.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 937fbfc381bb..c5b916605fb0 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -36,6 +36,8 @@ #include <linux/mount.h> #include <linux/nfs_idmap.h> #include <linux/vfs.h> +#include <linux/inet.h> +#include <linux/nfs_xdr.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -44,89 +46,17 @@ #include "callback.h" #include "delegation.h" #include "iostat.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_PARANOIA 1 -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); -static struct inode *nfs_alloc_inode(struct super_block *sb); -static void nfs_destroy_inode(struct inode *); -static int nfs_write_inode(struct inode *,int); -static void nfs_delete_inode(struct inode *); -static void nfs_clear_inode(struct inode *); -static void nfs_umount_begin(struct super_block *); -static int nfs_statfs(struct dentry *, struct kstatfs *); -static int nfs_show_options(struct seq_file *, struct vfsmount *); -static int nfs_show_stats(struct seq_file *, struct vfsmount *); static void nfs_zap_acl_cache(struct inode *); -static struct rpc_program nfs_program; - -static struct super_operations nfs_sops = { - .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, - .delete_inode = nfs_delete_inode, - .statfs = nfs_statfs, - .clear_inode = nfs_clear_inode, - .umount_begin = nfs_umount_begin, - .show_options = nfs_show_options, - .show_stats = nfs_show_stats, -}; - -/* - * RPC cruft for NFS - */ -static struct rpc_stat nfs_rpcstat = { - .program = &nfs_program -}; -static struct rpc_version * nfs_version[] = { - NULL, - NULL, - &nfs_version2, -#if defined(CONFIG_NFS_V3) - &nfs_version3, -#elif defined(CONFIG_NFS_V4) - NULL, -#endif -#if defined(CONFIG_NFS_V4) - &nfs_version4, -#endif -}; - -static struct rpc_program nfs_program = { - .name = "nfs", - .number = NFS_PROGRAM, - .nrvers = ARRAY_SIZE(nfs_version), - .version = nfs_version, - .stats = &nfs_rpcstat, - .pipe_dir_name = "/nfs", -}; - -#ifdef CONFIG_NFS_V3_ACL -static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version * nfsacl_version[] = { - [3] = &nfsacl_version3, -}; - -struct rpc_program nfsacl_program = { - .name = "nfsacl", - .number = NFS_ACL_PROGRAM, - .nrvers = ARRAY_SIZE(nfsacl_version), - .version = nfsacl_version, - .stats = &nfsacl_rpcstat, -}; -#endif /* CONFIG_NFS_V3_ACL */ +static kmem_cache_t * nfs_inode_cachep; static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) @@ -134,8 +64,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -static int -nfs_write_inode(struct inode *inode, int sync) +int nfs_write_inode(struct inode *inode, int sync) { int flags = sync ? FLUSH_SYNC : 0; int ret; @@ -146,31 +75,15 @@ nfs_write_inode(struct inode *inode, int sync) return 0; } -static void -nfs_delete_inode(struct inode * inode) +void nfs_clear_inode(struct inode *inode) { - dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); - - truncate_inode_pages(&inode->i_data, 0); + struct nfs_inode *nfsi = NFS_I(inode); + struct rpc_cred *cred; - nfs_wb_all(inode); /* * The following should never happen... */ - if (nfs_have_writebacks(inode)) { - printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino); - } - - clear_inode(inode); -} - -static void -nfs_clear_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct rpc_cred *cred; - - nfs_wb_all(inode); + BUG_ON(nfs_have_writebacks(inode)); BUG_ON (!list_empty(&nfsi->open_files)); nfs_zap_acl_cache(inode); cred = nfsi->cache_access.cred; @@ -179,555 +92,6 @@ nfs_clear_inode(struct inode *inode) BUG_ON(atomic_read(&nfsi->data_updates) != 0); } -void -nfs_umount_begin(struct super_block *sb) -{ - struct rpc_clnt *rpc = NFS_SB(sb)->client; - - /* -EIO all pending I/O */ - if (!IS_ERR(rpc)) - rpc_killall_tasks(rpc); - rpc = NFS_SB(sb)->client_acl; - if (!IS_ERR(rpc)) - rpc_killall_tasks(rpc); -} - - -static inline unsigned long -nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) -{ - /* make sure blocksize is a power of two */ - if ((bsize & (bsize - 1)) || nrbitsp) { - unsigned char nrbits; - - for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) - ; - bsize = 1 << nrbits; - if (nrbitsp) - *nrbitsp = nrbits; - } - - return bsize; -} - -/* - * Calculate the number of 512byte blocks used. - */ -static inline unsigned long -nfs_calc_block_size(u64 tsize) -{ - loff_t used = (tsize + 511) >> 9; - return (used > ULONG_MAX) ? ULONG_MAX : used; -} - -/* - * Compute and set NFS server blocksize - */ -static inline unsigned long -nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) -{ - if (bsize < NFS_MIN_FILE_IO_SIZE) - bsize = NFS_DEF_FILE_IO_SIZE; - else if (bsize >= NFS_MAX_FILE_IO_SIZE) - bsize = NFS_MAX_FILE_IO_SIZE; - - return nfs_block_bits(bsize, nrbitsp); -} - -/* - * Obtain the root inode of the file system. - */ -static struct inode * -nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo) -{ - struct nfs_server *server = NFS_SB(sb); - int error; - - error = server->rpc_ops->getroot(server, rootfh, fsinfo); - if (error < 0) { - dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); - } - - return nfs_fhget(sb, rootfh, fsinfo->fattr); -} - -/* - * Do NFS version-independent mount processing, and sanity checking - */ -static int -nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) -{ - struct nfs_server *server; - struct inode *root_inode; - struct nfs_fattr fattr; - struct nfs_fsinfo fsinfo = { - .fattr = &fattr, - }; - struct nfs_pathconf pathinfo = { - .fattr = &fattr, - }; - int no_root_error = 0; - unsigned long max_rpc_payload; - - /* We probably want something more informative here */ - snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); - - server = NFS_SB(sb); - - sb->s_magic = NFS_SUPER_MAGIC; - - server->io_stats = nfs_alloc_iostats(); - if (server->io_stats == NULL) - return -ENOMEM; - - root_inode = nfs_get_root(sb, &server->fh, &fsinfo); - /* Did getting the root inode fail? */ - if (IS_ERR(root_inode)) { - no_root_error = PTR_ERR(root_inode); - goto out_no_root; - } - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) { - no_root_error = -ENOMEM; - goto out_no_root; - } - sb->s_root->d_op = server->rpc_ops->dentry_ops; - - /* mount time stamp, in seconds */ - server->mount_time = jiffies; - - /* Get some general file system info */ - if (server->namelen == 0 && - server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) - server->namelen = pathinfo.max_namelen; - /* Work out a lot of parameters */ - if (server->rsize == 0) - server->rsize = nfs_block_size(fsinfo.rtpref, NULL); - if (server->wsize == 0) - server->wsize = nfs_block_size(fsinfo.wtpref, NULL); - - if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax) - server->rsize = nfs_block_size(fsinfo.rtmax, NULL); - if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax) - server->wsize = nfs_block_size(fsinfo.wtmax, NULL); - - max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); - if (server->rsize > max_rpc_payload) - server->rsize = max_rpc_payload; - if (server->rsize > NFS_MAX_FILE_IO_SIZE) - server->rsize = NFS_MAX_FILE_IO_SIZE; - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (server->wsize > max_rpc_payload) - server->wsize = max_rpc_payload; - if (server->wsize > NFS_MAX_FILE_IO_SIZE) - server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (sb->s_blocksize == 0) - sb->s_blocksize = nfs_block_bits(server->wsize, - &sb->s_blocksize_bits); - server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL); - - server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; - if (server->dtsize > server->rsize) - server->dtsize = server->rsize; - - if (server->flags & NFS_MOUNT_NOAC) { - server->acregmin = server->acregmax = 0; - server->acdirmin = server->acdirmax = 0; - sb->s_flags |= MS_SYNCHRONOUS; - } - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - - sb->s_maxbytes = fsinfo.maxfilesize; - if (sb->s_maxbytes > MAX_LFS_FILESIZE) - sb->s_maxbytes = MAX_LFS_FILESIZE; - - server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0; - server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0; - - /* We're airborne Set socket buffersize */ - rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); - return 0; - /* Yargs. It didn't work out. */ -out_no_root: - dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error); - if (!IS_ERR(root_inode)) - iput(root_inode); - return no_root_error; -} - -static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) -{ - to->to_initval = timeo * HZ / 10; - to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; - - switch (proto) { - case IPPROTO_TCP: - if (!to->to_initval) - to->to_initval = 60 * HZ; - if (to->to_initval > NFS_MAX_TCP_TIMEOUT) - to->to_initval = NFS_MAX_TCP_TIMEOUT; - to->to_increment = to->to_initval; - to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); - to->to_exponential = 0; - break; - case IPPROTO_UDP: - default: - if (!to->to_initval) - to->to_initval = 11 * HZ / 10; - if (to->to_initval > NFS_MAX_UDP_TIMEOUT) - to->to_initval = NFS_MAX_UDP_TIMEOUT; - to->to_maxval = NFS_MAX_UDP_TIMEOUT; - to->to_exponential = 1; - break; - } -} - -/* - * Create an RPC client handle. - */ -static struct rpc_clnt * -nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) -{ - struct rpc_timeout timeparms; - struct rpc_xprt *xprt = NULL; - struct rpc_clnt *clnt = NULL; - int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - - nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans); - - server->retrans_timeo = timeparms.to_initval; - server->retrans_count = timeparms.to_retries; - - /* create transport and client */ - xprt = xprt_create_proto(proto, &server->addr, &timeparms); - if (IS_ERR(xprt)) { - dprintk("%s: cannot create RPC transport. Error = %ld\n", - __FUNCTION__, PTR_ERR(xprt)); - return (struct rpc_clnt *)xprt; - } - clnt = rpc_create_client(xprt, server->hostname, &nfs_program, - server->rpc_ops->version, data->pseudoflavor); - if (IS_ERR(clnt)) { - dprintk("%s: cannot create RPC client. Error = %ld\n", - __FUNCTION__, PTR_ERR(xprt)); - goto out_fail; - } - - clnt->cl_intr = 1; - clnt->cl_softrtry = 1; - - return clnt; - -out_fail: - return clnt; -} - -/* - * The way this works is that the mount process passes a structure - * in the data argument which contains the server's IP address - * and the root file handle obtained from the server's mount - * daemon. We stash these away in the private superblock fields. - */ -static int -nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) -{ - struct nfs_server *server; - rpc_authflavor_t authflavor; - - server = NFS_SB(sb); - sb->s_blocksize_bits = 0; - sb->s_blocksize = 0; - if (data->bsize) - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); - server->flags = data->flags & NFS_MOUNT_FLAGMASK; - - server->acregmin = data->acregmin*HZ; - server->acregmax = data->acregmax*HZ; - server->acdirmin = data->acdirmin*HZ; - server->acdirmax = data->acdirmax*HZ; - - /* Start lockd here, before we might error out */ - if (!(server->flags & NFS_MOUNT_NONLM)) - lockd_up(); - - server->namelen = data->namlen; - server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL); - if (!server->hostname) - return -ENOMEM; - strcpy(server->hostname, data->hostname); - - /* Check NFS protocol revision and initialize RPC op vector - * and file handle pool. */ -#ifdef CONFIG_NFS_V3 - if (server->flags & NFS_MOUNT_VER3) { - server->rpc_ops = &nfs_v3_clientops; - server->caps |= NFS_CAP_READDIRPLUS; - } else { - server->rpc_ops = &nfs_v2_clientops; - } -#else - server->rpc_ops = &nfs_v2_clientops; -#endif - - /* Fill in pseudoflavor for mount version < 5 */ - if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) - data->pseudoflavor = RPC_AUTH_UNIX; - authflavor = data->pseudoflavor; /* save for sb_init() */ - /* XXX maybe we want to add a server->pseudoflavor field */ - - /* Create RPC client handles */ - server->client = nfs_create_client(server, data); - if (IS_ERR(server->client)) - return PTR_ERR(server->client); - /* RFC 2623, sec 2.3.2 */ - if (authflavor != RPC_AUTH_UNIX) { - struct rpc_auth *auth; - - server->client_sys = rpc_clone_client(server->client); - if (IS_ERR(server->client_sys)) - return PTR_ERR(server->client_sys); - auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys); - if (IS_ERR(auth)) - return PTR_ERR(auth); - } else { - atomic_inc(&server->client->cl_count); - server->client_sys = server->client; - } - if (server->flags & NFS_MOUNT_VER3) { -#ifdef CONFIG_NFS_V3_ACL - if (!(server->flags & NFS_MOUNT_NOACL)) { - server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); - /* No errors! Assume that Sun nfsacls are supported */ - if (!IS_ERR(server->client_acl)) - server->caps |= NFS_CAP_ACLS; - } -#else - server->flags &= ~NFS_MOUNT_NOACL; -#endif /* CONFIG_NFS_V3_ACL */ - /* - * The VFS shouldn't apply the umask to mode bits. We will - * do so ourselves when necessary. - */ - sb->s_flags |= MS_POSIXACL; - if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) - server->namelen = NFS3_MAXNAMLEN; - sb->s_time_gran = 1; - } else { - if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) - server->namelen = NFS2_MAXNAMLEN; - } - - sb->s_op = &nfs_sops; - return nfs_sb_init(sb, authflavor); -} - -static int -nfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct nfs_server *server = NFS_SB(sb); - unsigned char blockbits; - unsigned long blockres; - struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode); - struct nfs_fattr fattr; - struct nfs_fsstat res = { - .fattr = &fattr, - }; - int error; - - lock_kernel(); - - error = server->rpc_ops->statfs(server, rootfh, &res); - buf->f_type = NFS_SUPER_MAGIC; - if (error < 0) - goto out_err; - - /* - * Current versions of glibc do not correctly handle the - * case where f_frsize != f_bsize. Eventually we want to - * report the value of wtmult in this field. - */ - buf->f_frsize = sb->s_blocksize; - - /* - * On most *nix systems, f_blocks, f_bfree, and f_bavail - * are reported in units of f_frsize. Linux hasn't had - * an f_frsize field in its statfs struct until recently, - * thus historically Linux's sys_statfs reports these - * fields in units of f_bsize. - */ - buf->f_bsize = sb->s_blocksize; - blockbits = sb->s_blocksize_bits; - blockres = (1 << blockbits) - 1; - buf->f_blocks = (res.tbytes + blockres) >> blockbits; - buf->f_bfree = (res.fbytes + blockres) >> blockbits; - buf->f_bavail = (res.abytes + blockres) >> blockbits; - - buf->f_files = res.tfiles; - buf->f_ffree = res.afiles; - - buf->f_namelen = server->namelen; - out: - unlock_kernel(); - return 0; - - out_err: - dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); - buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; - goto out; - -} - -static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) -{ - static struct proc_nfs_info { - int flag; - char *str; - char *nostr; - } nfs_info[] = { - { NFS_MOUNT_SOFT, ",soft", ",hard" }, - { NFS_MOUNT_INTR, ",intr", "" }, - { NFS_MOUNT_NOCTO, ",nocto", "" }, - { NFS_MOUNT_NOAC, ",noac", "" }, - { NFS_MOUNT_NONLM, ",nolock", "" }, - { NFS_MOUNT_NOACL, ",noacl", "" }, - { 0, NULL, NULL } - }; - struct proc_nfs_info *nfs_infop; - char buf[12]; - char *proto; - - seq_printf(m, ",vers=%d", nfss->rpc_ops->version); - seq_printf(m, ",rsize=%d", nfss->rsize); - seq_printf(m, ",wsize=%d", nfss->wsize); - if (nfss->acregmin != 3*HZ || showdefaults) - seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); - if (nfss->acregmax != 60*HZ || showdefaults) - seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); - if (nfss->acdirmin != 30*HZ || showdefaults) - seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); - if (nfss->acdirmax != 60*HZ || showdefaults) - seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); - for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { - if (nfss->flags & nfs_infop->flag) - seq_puts(m, nfs_infop->str); - else - seq_puts(m, nfs_infop->nostr); - } - switch (nfss->client->cl_xprt->prot) { - case IPPROTO_TCP: - proto = "tcp"; - break; - case IPPROTO_UDP: - proto = "udp"; - break; - default: - snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot); - proto = buf; - } - seq_printf(m, ",proto=%s", proto); - seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); - seq_printf(m, ",retrans=%u", nfss->retrans_count); -} - -static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); - - nfs_show_mount_options(m, nfss, 0); - - seq_puts(m, ",addr="); - seq_escape(m, nfss->hostname, " \t\n\\"); - - return 0; -} - -static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) -{ - int i, cpu; - struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); - struct rpc_auth *auth = nfss->client->cl_auth; - struct nfs_iostats totals = { }; - - seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); - - /* - * Display all mount option settings - */ - seq_printf(m, "\n\topts:\t"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); - seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); - nfs_show_mount_options(m, nfss, 1); - - seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); - - seq_printf(m, "\n\tcaps:\t"); - seq_printf(m, "caps=0x%x", nfss->caps); - seq_printf(m, ",wtmult=%d", nfss->wtmult); - seq_printf(m, ",dtsize=%d", nfss->dtsize); - seq_printf(m, ",bsize=%d", nfss->bsize); - seq_printf(m, ",namelen=%d", nfss->namelen); - -#ifdef CONFIG_NFS_V4 - if (nfss->rpc_ops->version == 4) { - seq_printf(m, "\n\tnfsv4:\t"); - seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); - seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); - seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); - } -#endif - - /* - * Display security flavor in effect for this mount - */ - seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); - if (auth->au_flavor) - seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); - - /* - * Display superblock I/O counters - */ - for_each_possible_cpu(cpu) { - struct nfs_iostats *stats; - - preempt_disable(); - stats = per_cpu_ptr(nfss->io_stats, cpu); - - for (i = 0; i < __NFSIOS_COUNTSMAX; i++) - totals.events[i] += stats->events[i]; - for (i = 0; i < __NFSIOS_BYTESMAX; i++) - totals.bytes[i] += stats->bytes[i]; - - preempt_enable(); - } - - seq_printf(m, "\n\tevents:\t"); - for (i = 0; i < __NFSIOS_COUNTSMAX; i++) - seq_printf(m, "%lu ", totals.events[i]); - seq_printf(m, "\n\tbytes:\t"); - for (i = 0; i < __NFSIOS_BYTESMAX; i++) - seq_printf(m, "%Lu ", totals.bytes[i]); - seq_printf(m, "\n"); - - rpc_print_iostats(m, nfss->client); - - return 0; -} - /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -890,6 +254,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && fattr->size <= NFS_LIMIT_READDIRPLUS) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); + /* Deal with crossing mountpoints */ + if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) + inode->i_op = &nfs_referral_inode_operations; + else + inode->i_op = &nfs_mountpoint_inode_operations; + inode->i_fop = NULL; + } } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; else @@ -1208,6 +580,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); lock_kernel(); if (!inode || is_bad_inode(inode)) goto out_nowait; @@ -1221,7 +594,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = -ESTALE; /* Do we trust the cached ESTALE? */ if (NFS_ATTRTIMEO(inode) != 0) { - if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) { + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) { /* no */ } else goto out; @@ -1252,8 +625,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } spin_unlock(&inode->i_lock); - nfs_revalidate_mapping(inode, inode->i_mapping); - if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); @@ -1287,8 +658,7 @@ int nfs_attribute_timeout(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); - if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) && !nfs_attribute_timeout(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); @@ -1299,9 +669,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) * @inode - pointer to host inode * @mapping - pointer to mapping */ -void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); + int ret = 0; + + if (NFS_STALE(inode)) + ret = -ESTALE; + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + || nfs_attribute_timeout(inode)) + ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); @@ -1322,6 +699,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) inode->i_sb->s_id, (long long)NFS_FILEID(inode)); } + return ret; } /** @@ -1361,12 +739,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); - if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 - && nfsi->change_attr == fattr->pre_change_attr) { - nfsi->change_attr = fattr->change_attr; - nfsi->cache_change_attribute = jiffies; - } - /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_WCC) != 0) { if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { @@ -1400,9 +772,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat int data_unstable; - if ((fattr->valid & NFS_ATTR_FATTR) == 0) - return 0; - /* Has the inode gone and changed behind our back? */ if (nfsi->fileid != fattr->fileid || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { @@ -1415,20 +784,13 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) { - if (nfsi->change_attr == fattr->change_attr) - goto out; - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - if (!data_unstable) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; - } + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + nfsi->change_attr != fattr->change_attr) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; /* Verify a few of the more important attributes */ - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - if (!data_unstable) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; - } + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); @@ -1445,7 +807,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (inode->i_nlink != fattr->nlink) nfsi->cache_validity |= NFS_INO_INVALID_ATTR; -out: if (!timespec_equal(&inode->i_atime, &fattr->atime)) nfsi->cache_validity |= NFS_INO_INVALID_ATIME; @@ -1471,7 +832,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; if (time_after(fattr->time_start, nfsi->last_updated)) status = nfs_update_inode(inode, fattr); else @@ -1496,7 +856,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; goto out; } status = nfs_update_inode(inode, fattr); @@ -1519,6 +879,7 @@ out: */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { + struct nfs_server *server; struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; unsigned int invalid = 0; @@ -1528,9 +889,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) __FUNCTION__, inode->i_sb->s_id, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR) == 0) - return 0; - if (nfsi->fileid != fattr->fileid) goto out_fileid; @@ -1540,6 +898,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) goto out_changed; + server = NFS_SERVER(inode); + /* Update the fsid if and only if this is the root directory */ + if (inode == inode->i_sb->s_root->d_inode + && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + server->fsid = fattr->fsid; + /* * Update the read time so we don't revalidate too often. */ @@ -1549,7 +913,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Are we racing with known updates of the metadata on the server? */ data_stable = nfs_verify_change_attribute(inode, fattr->time_start); if (data_stable) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1613,15 +977,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_blksize = fattr->du.nfs2.blocksize; } - if ((fattr->valid & NFS_ATTR_FATTR_V4)) { - if (nfsi->change_attr != fattr->change_attr) { - dprintk("NFS: change_attr change on server for file %s/%ld\n", - inode->i_sb->s_id, inode->i_ino); - nfsi->change_attr = fattr->change_attr; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - nfsi->cache_change_attribute = jiffies; - } else - invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA); + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + nfsi->change_attr != fattr->change_attr) { + dprintk("NFS: change_attr change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); + nfsi->change_attr = fattr->change_attr; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfsi->cache_change_attribute = jiffies; } /* Update attrtimeo value if we're out of the unstable period */ @@ -1669,202 +1031,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) goto out_err; } -/* - * File system information - */ - -static int nfs_set_super(struct super_block *s, void *data) -{ - s->s_fs_info = data; - return set_anon_super(s, data); -} - -static int nfs_compare_super(struct super_block *sb, void *data) -{ - struct nfs_server *server = data; - struct nfs_server *old = NFS_SB(sb); - - if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr) - return 0; - if (old->addr.sin_port != server->addr.sin_port) - return 0; - return !nfs_compare_fh(&old->fh, &server->fh); -} - -static int nfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ - int error; - struct nfs_server *server = NULL; - struct super_block *s; - struct nfs_fh *root; - struct nfs_mount_data *data = raw_data; - - error = -EINVAL; - if (data == NULL) { - dprintk("%s: missing data argument\n", __FUNCTION__); - goto out_err_noserver; - } - if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { - dprintk("%s: bad mount version\n", __FUNCTION__); - goto out_err_noserver; - } - switch (data->version) { - case 1: - data->namlen = 0; - case 2: - data->bsize = 0; - case 3: - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: mount structure version %d does not support NFSv3\n", - __FUNCTION__, - data->version); - goto out_err_noserver; - } - data->root.size = NFS2_FHSIZE; - memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); - case 4: - if (data->flags & NFS_MOUNT_SECFLAVOUR) { - dprintk("%s: mount structure version %d does not support strong security\n", - __FUNCTION__, - data->version); - goto out_err_noserver; - } - case 5: - memset(data->context, 0, sizeof(data->context)); - } -#ifndef CONFIG_NFS_V3 - /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ - error = -EPROTONOSUPPORT; - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); - goto out_err_noserver; - } -#endif /* CONFIG_NFS_V3 */ - - error = -ENOMEM; - server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); - if (!server) - goto out_err_noserver; - /* Zero out the NFS state stuff */ - init_nfsv4_state(server); - server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); - - root = &server->fh; - if (data->flags & NFS_MOUNT_VER3) - root->size = data->root.size; - else - root->size = NFS2_FHSIZE; - error = -EINVAL; - if (root->size > sizeof(root->data)) { - dprintk("%s: invalid root filehandle\n", __FUNCTION__); - goto out_err; - } - memcpy(root->data, data->root.data, root->size); - - /* We now require that the mount process passes the remote address */ - memcpy(&server->addr, &data->addr, sizeof(server->addr)); - if (server->addr.sin_addr.s_addr == INADDR_ANY) { - dprintk("%s: mount program didn't pass remote address!\n", - __FUNCTION__); - goto out_err; - } - - /* Fire up rpciod if not yet running */ - error = rpciod_up(); - if (error < 0) { - dprintk("%s: couldn't start rpciod! Error = %d\n", - __FUNCTION__, error); - goto out_err; - } - - s = sget(fs_type, nfs_compare_super, nfs_set_super, server); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_err_rpciod; - } - - if (s->s_root) - goto out_rpciod_down; - - s->s_flags = flags; - - error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&s->s_umount); - deactivate_super(s); - return error; - } - s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); - -out_rpciod_down: - rpciod_down(); - kfree(server); - return simple_set_mnt(mnt, s); - -out_err_rpciod: - rpciod_down(); -out_err: - kfree(server); -out_err_noserver: - return error; -} - -static void nfs_kill_super(struct super_block *s) -{ - struct nfs_server *server = NFS_SB(s); - - kill_anon_super(s); - - if (!IS_ERR(server->client)) - rpc_shutdown_client(server->client); - if (!IS_ERR(server->client_sys)) - rpc_shutdown_client(server->client_sys); - if (!IS_ERR(server->client_acl)) - rpc_shutdown_client(server->client_acl); - - if (!(server->flags & NFS_MOUNT_NONLM)) - lockd_down(); /* release rpc.lockd */ - - rpciod_down(); /* release rpciod */ - - nfs_free_iostats(server->io_stats); - kfree(server->hostname); - kfree(server); -} - -static struct file_system_type nfs_fs_type = { - .owner = THIS_MODULE, - .name = "nfs", - .get_sb = nfs_get_sb, - .kill_sb = nfs_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; #ifdef CONFIG_NFS_V4 -static void nfs4_clear_inode(struct inode *); - - -static struct super_operations nfs4_sops = { - .alloc_inode = nfs_alloc_inode, - .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, - .delete_inode = nfs_delete_inode, - .statfs = nfs_statfs, - .clear_inode = nfs4_clear_inode, - .umount_begin = nfs_umount_begin, - .show_options = nfs_show_options, - .show_stats = nfs_show_stats, -}; - /* * Clean out any remaining NFSv4 state that might be left over due * to open() calls that passed nfs_atomic_lookup, but failed to call * nfs_open(). */ -static void nfs4_clear_inode(struct inode *inode) +void nfs4_clear_inode(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1888,365 +1063,9 @@ static void nfs4_clear_inode(struct inode *inode) nfs4_close_state(state, state->state); } } - - -static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent) -{ - struct nfs_server *server; - struct nfs4_client *clp = NULL; - struct rpc_xprt *xprt = NULL; - struct rpc_clnt *clnt = NULL; - struct rpc_timeout timeparms; - rpc_authflavor_t authflavour; - int err = -EIO; - - sb->s_blocksize_bits = 0; - sb->s_blocksize = 0; - server = NFS_SB(sb); - if (data->rsize != 0) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize != 0) - server->wsize = nfs_block_size(data->wsize, NULL); - server->flags = data->flags & NFS_MOUNT_FLAGMASK; - server->caps = NFS_CAP_ATOMIC_OPEN; - - server->acregmin = data->acregmin*HZ; - server->acregmax = data->acregmax*HZ; - server->acdirmin = data->acdirmin*HZ; - server->acdirmax = data->acdirmax*HZ; - - server->rpc_ops = &nfs_v4_clientops; - - nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); - - server->retrans_timeo = timeparms.to_initval; - server->retrans_count = timeparms.to_retries; - - clp = nfs4_get_client(&server->addr.sin_addr); - if (!clp) { - dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); - return -EIO; - } - - /* Now create transport and client */ - authflavour = RPC_AUTH_UNIX; - if (data->auth_flavourlen != 0) { - if (data->auth_flavourlen != 1) { - dprintk("%s: Invalid number of RPC auth flavours %d.\n", - __FUNCTION__, data->auth_flavourlen); - err = -EINVAL; - goto out_fail; - } - if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) { - err = -EFAULT; - goto out_fail; - } - } - - down_write(&clp->cl_sem); - if (IS_ERR(clp->cl_rpcclient)) { - xprt = xprt_create_proto(data->proto, &server->addr, &timeparms); - if (IS_ERR(xprt)) { - up_write(&clp->cl_sem); - err = PTR_ERR(xprt); - dprintk("%s: cannot create RPC transport. Error = %d\n", - __FUNCTION__, err); - goto out_fail; - } - clnt = rpc_create_client(xprt, server->hostname, &nfs_program, - server->rpc_ops->version, authflavour); - if (IS_ERR(clnt)) { - up_write(&clp->cl_sem); - err = PTR_ERR(clnt); - dprintk("%s: cannot create RPC client. Error = %d\n", - __FUNCTION__, err); - goto out_fail; - } - clnt->cl_intr = 1; - clnt->cl_softrtry = 1; - clp->cl_rpcclient = clnt; - memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); - nfs_idmap_new(clp); - } - list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); - clnt = rpc_clone_client(clp->cl_rpcclient); - if (!IS_ERR(clnt)) - server->nfs4_state = clp; - up_write(&clp->cl_sem); - clp = NULL; - - if (IS_ERR(clnt)) { - err = PTR_ERR(clnt); - dprintk("%s: cannot create RPC client. Error = %d\n", - __FUNCTION__, err); - return err; - } - - server->client = clnt; - - if (server->nfs4_state->cl_idmap == NULL) { - dprintk("%s: failed to create idmapper.\n", __FUNCTION__); - return -ENOMEM; - } - - if (clnt->cl_auth->au_flavor != authflavour) { - struct rpc_auth *auth; - - auth = rpcauth_create(authflavour, clnt); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __FUNCTION__); - return PTR_ERR(auth); - } - } - - sb->s_time_gran = 1; - - sb->s_op = &nfs4_sops; - err = nfs_sb_init(sb, authflavour); - if (err == 0) - return 0; -out_fail: - if (clp) - nfs4_put_client(clp); - return err; -} - -static int nfs4_compare_super(struct super_block *sb, void *data) -{ - struct nfs_server *server = data; - struct nfs_server *old = NFS_SB(sb); - - if (strcmp(server->hostname, old->hostname) != 0) - return 0; - if (strcmp(server->mnt_path, old->mnt_path) != 0) - return 0; - return 1; -} - -static void * -nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) -{ - void *p = NULL; - - if (!src->len) - return ERR_PTR(-EINVAL); - if (src->len < maxlen) - maxlen = src->len; - if (dst == NULL) { - p = dst = kmalloc(maxlen + 1, GFP_KERNEL); - if (p == NULL) - return ERR_PTR(-ENOMEM); - } - if (copy_from_user(dst, src->data, maxlen)) { - kfree(p); - return ERR_PTR(-EFAULT); - } - dst[maxlen] = '\0'; - return dst; -} - -static int nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ - int error; - struct nfs_server *server; - struct super_block *s; - struct nfs4_mount_data *data = raw_data; - void *p; - - if (data == NULL) { - dprintk("%s: missing data argument\n", __FUNCTION__); - return -EINVAL; - } - if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { - dprintk("%s: bad mount version\n", __FUNCTION__); - return -EINVAL; - } - - server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); - if (!server) - return -ENOMEM; - /* Zero out the NFS state stuff */ - init_nfsv4_state(server); - server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); - - p = nfs_copy_user_string(NULL, &data->hostname, 256); - if (IS_ERR(p)) - goto out_err; - server->hostname = p; - - p = nfs_copy_user_string(NULL, &data->mnt_path, 1024); - if (IS_ERR(p)) - goto out_err; - server->mnt_path = p; - - p = nfs_copy_user_string(server->ip_addr, &data->client_addr, - sizeof(server->ip_addr) - 1); - if (IS_ERR(p)) - goto out_err; - - /* We now require that the mount process passes the remote address */ - if (data->host_addrlen != sizeof(server->addr)) { - error = -EINVAL; - goto out_free; - } - if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { - error = -EFAULT; - goto out_free; - } - if (server->addr.sin_family != AF_INET || - server->addr.sin_addr.s_addr == INADDR_ANY) { - dprintk("%s: mount program didn't pass remote IP address!\n", - __FUNCTION__); - error = -EINVAL; - goto out_free; - } - - /* Fire up rpciod if not yet running */ - error = rpciod_up(); - if (error < 0) { - dprintk("%s: couldn't start rpciod! Error = %d\n", - __FUNCTION__, error); - goto out_free; - } - - s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto out_free; - } - - if (s->s_root) { - kfree(server->mnt_path); - kfree(server->hostname); - kfree(server); - return simple_set_mnt(mnt, s); - } - - s->s_flags = flags; - - error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&s->s_umount); - deactivate_super(s); - return error; - } - s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); -out_err: - error = PTR_ERR(p); -out_free: - kfree(server->mnt_path); - kfree(server->hostname); - kfree(server); - return error; -} - -static void nfs4_kill_super(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - nfs_return_all_delegations(sb); - kill_anon_super(sb); - - nfs4_renewd_prepare_shutdown(server); - - if (server->client != NULL && !IS_ERR(server->client)) - rpc_shutdown_client(server->client); - - destroy_nfsv4_state(server); - - rpciod_down(); - - nfs_free_iostats(server->io_stats); - kfree(server->hostname); - kfree(server); -} - -static struct file_system_type nfs4_fs_type = { - .owner = THIS_MODULE, - .name = "nfs4", - .get_sb = nfs4_get_sb, - .kill_sb = nfs4_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, -}; - -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; -static int param_set_port(const char *val, struct kernel_param *kp) -{ - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) - return -EINVAL; - *((int *)kp->arg) = num; - return 0; -} - -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); - -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) -{ - char *endp; - int num = simple_strtol(val, &endp, 0); - int jif = num * HZ; - if (endp == val || *endp || num < 0 || jif < num) - return -EINVAL; - *((int *)kp->arg) = jif; - return 0; -} - -module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, - &nfs_idmap_cache_timeout, 0644); - -#define nfs4_init_once(nfsi) \ - do { \ - INIT_LIST_HEAD(&(nfsi)->open_states); \ - nfsi->delegation = NULL; \ - nfsi->delegation_state = 0; \ - init_rwsem(&nfsi->rwsem); \ - } while(0) - -static inline int register_nfs4fs(void) -{ - int ret; - - ret = nfs_register_sysctl(); - if (ret != 0) - return ret; - ret = register_filesystem(&nfs4_fs_type); - if (ret != 0) - nfs_unregister_sysctl(); - return ret; -} - -static inline void unregister_nfs4fs(void) -{ - unregister_filesystem(&nfs4_fs_type); - nfs_unregister_sysctl(); -} -#else -#define nfs4_init_once(nfsi) \ - do { } while (0) -#define register_nfs4fs() (0) -#define unregister_nfs4fs() #endif -extern int nfs_init_nfspagecache(void); -extern void nfs_destroy_nfspagecache(void); -extern int nfs_init_readpagecache(void); -extern void nfs_destroy_readpagecache(void); -extern int nfs_init_writepagecache(void); -extern void nfs_destroy_writepagecache(void); -#ifdef CONFIG_NFS_DIRECTIO -extern int nfs_init_directcache(void); -extern void nfs_destroy_directcache(void); -#endif - -static kmem_cache_t * nfs_inode_cachep; - -static struct inode *nfs_alloc_inode(struct super_block *sb) +struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); @@ -2265,11 +1084,21 @@ static struct inode *nfs_alloc_inode(struct super_block *sb) return &nfsi->vfs_inode; } -static void nfs_destroy_inode(struct inode *inode) +void nfs_destroy_inode(struct inode *inode) { kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +static inline void nfs4_init_once(struct nfs_inode *nfsi) +{ +#ifdef CONFIG_NFS_V4 + INIT_LIST_HEAD(&nfsi->open_states); + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); +#endif +} + static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; @@ -2290,7 +1119,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) } } -static int nfs_init_inodecache(void) +static int __init nfs_init_inodecache(void) { nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), @@ -2332,29 +1161,22 @@ static int __init init_nfs_fs(void) if (err) goto out1; -#ifdef CONFIG_NFS_DIRECTIO err = nfs_init_directcache(); if (err) goto out0; -#endif #ifdef CONFIG_PROC_FS rpc_proc_register(&nfs_rpcstat); #endif - err = register_filesystem(&nfs_fs_type); - if (err) - goto out; - if ((err = register_nfs4fs()) != 0) + if ((err = register_nfs_fs()) != 0) goto out; return 0; out: #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif -#ifdef CONFIG_NFS_DIRECTIO nfs_destroy_directcache(); out0: -#endif nfs_destroy_writepagecache(); out1: nfs_destroy_readpagecache(); @@ -2368,9 +1190,7 @@ out4: static void __exit exit_nfs_fs(void) { -#ifdef CONFIG_NFS_DIRECTIO nfs_destroy_directcache(); -#endif nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_inodecache(); @@ -2378,8 +1198,7 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif - unregister_filesystem(&nfs_fs_type); - unregister_nfs4fs(); + unregister_nfs_fs(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h new file mode 100644 index 000000000000..4fe51c1292bb --- /dev/null +++ b/fs/nfs/internal.h @@ -0,0 +1,186 @@ +/* + * NFS internal definitions + */ + +#include <linux/mount.h> + +struct nfs_clone_mount { + const struct super_block *sb; + const struct dentry *dentry; + struct nfs_fh *fh; + struct nfs_fattr *fattr; + char *hostname; + char *mnt_path; + struct sockaddr_in *addr; + rpc_authflavor_t authflavor; +}; + +/* namespace-nfs4.c */ +#ifdef CONFIG_NFS_V4 +extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); +#else +static inline +struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +{ + return ERR_PTR(-ENOENT); +} +#endif + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; + +/* pagelist.c */ +extern int __init nfs_init_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); +extern int __init nfs_init_readpagecache(void); +extern void nfs_destroy_readpagecache(void); +extern int __init nfs_init_writepagecache(void); +extern void nfs_destroy_writepagecache(void); + +#ifdef CONFIG_NFS_DIRECTIO +extern int __init nfs_init_directcache(void); +extern void nfs_destroy_directcache(void); +#else +#define nfs_init_directcache() (0) +#define nfs_destroy_directcache() do {} while(0) +#endif + +/* nfs2xdr.c */ +extern struct rpc_procinfo nfs_procedures[]; +extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); + +/* nfs3xdr.c */ +extern struct rpc_procinfo nfs3_procedures[]; +extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); + +/* nfs4xdr.c */ +extern int nfs_stat_to_errno(int); +extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); + +/* nfs4proc.c */ +#ifdef CONFIG_NFS_V4 +extern struct rpc_procinfo nfs4_procedures[]; + +extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, + struct nfs4_fs_locations *fs_locations, + struct page *page); +#endif + +/* inode.c */ +extern struct inode *nfs_alloc_inode(struct super_block *sb); +extern void nfs_destroy_inode(struct inode *); +extern int nfs_write_inode(struct inode *,int); +extern void nfs_clear_inode(struct inode *); +#ifdef CONFIG_NFS_V4 +extern void nfs4_clear_inode(struct inode *); +#endif + +/* super.c */ +extern struct file_system_type nfs_referral_nfs4_fs_type; +extern struct file_system_type clone_nfs_fs_type; +#ifdef CONFIG_NFS_V4 +extern struct file_system_type clone_nfs4_fs_type; +#endif +#ifdef CONFIG_PROC_FS +extern struct rpc_stat nfs_rpcstat; +#endif +extern int __init register_nfs_fs(void); +extern void __exit unregister_nfs_fs(void); + +/* namespace.c */ +extern char *nfs_path(const char *base, const struct dentry *dentry, + char *buffer, ssize_t buflen); + +/* + * Determine the mount path as a string + */ +static inline char * +nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen) +{ +#ifdef CONFIG_NFS_V4 + return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen); +#else + return NULL; +#endif +} + +/* + * Determine the device name as a string + */ +static inline char *nfs_devname(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen); +} + +/* + * Determine the actual block size (and log2 thereof) + */ +static inline +unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) +{ + /* make sure blocksize is a power of two */ + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + + for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) + ; + bsize = 1 << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } + + return bsize; +} + +/* + * Calculate the number of 512byte blocks used. + */ +static inline unsigned long nfs_calc_block_size(u64 tsize) +{ + loff_t used = (tsize + 511) >> 9; + return (used > ULONG_MAX) ? ULONG_MAX : used; +} + +/* + * Compute and set NFS server blocksize + */ +static inline +unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) +{ + if (bsize < NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; + + return nfs_block_bits(bsize, nrbitsp); +} + +/* + * Determine the maximum file size for a superblock + */ +static inline +void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) +{ + sb->s_maxbytes = (loff_t)maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) + sb->s_maxbytes = MAX_LFS_FILESIZE; +} + +/* + * Check if the string represents a "valid" IPv4 address + */ +static inline int valid_ipaddr4(const char *buf) +{ + int rc, count, in[4]; + + rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); + if (rc != 4) + return -EINVAL; + for (count = 0; count < 4; count++) { + if (in[count] > 255) + return -EINVAL; + } + return 0; +} diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c new file mode 100644 index 000000000000..19b98ca468eb --- /dev/null +++ b/fs/nfs/namespace.c @@ -0,0 +1,229 @@ +/* + * linux/fs/nfs/namespace.c + * + * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * NFS namespace + */ + +#include <linux/config.h> + +#include <linux/dcache.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/nfs_fs.h> +#include <linux/string.h> +#include <linux/sunrpc/clnt.h> +#include <linux/vfs.h> +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +static void nfs_expire_automounts(void *list); + +LIST_HEAD(nfs_automount_list); +static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list); +int nfs_mountpoint_expiry_timeout = 500 * HZ; + +/* + * nfs_path - reconstruct the path given an arbitrary dentry + * @base - arbitrary string to prepend to the path + * @dentry - pointer to dentry + * @buffer - result buffer + * @buflen - length of buffer + * + * Helper function for constructing the path from the + * root dentry to an arbitrary hashed dentry. + * + * This is mainly for use in figuring out the path on the + * server side when automounting on top of an existing partition. + */ +char *nfs_path(const char *base, const struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + char *end = buffer+buflen; + int namelen; + + *--end = '\0'; + buflen--; + spin_lock(&dcache_lock); + while (!IS_ROOT(dentry)) { + namelen = dentry->d_name.len; + buflen -= namelen + 1; + if (buflen < 0) + goto Elong; + end -= namelen; + memcpy(end, dentry->d_name.name, namelen); + *--end = '/'; + dentry = dentry->d_parent; + } + spin_unlock(&dcache_lock); + namelen = strlen(base); + /* Strip off excess slashes in base string */ + while (namelen > 0 && base[namelen - 1] == '/') + namelen--; + buflen -= namelen; + if (buflen < 0) + goto Elong; + end -= namelen; + memcpy(end, base, namelen); + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + +/* + * nfs_follow_mountpoint - handle crossing a mountpoint on the server + * @dentry - dentry of mountpoint + * @nd - nameidata info + * + * When we encounter a mountpoint on the server, we want to set up + * a mountpoint on the client too, to prevent inode numbers from + * colliding, and to allow "df" to work properly. + * On NFSv4, we also want to allow for the fact that different + * filesystems may be migrated to different servers in a failover + * situation, and that different filesystems may want to use + * different security flavours. + */ +static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) +{ + struct vfsmount *mnt; + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct dentry *parent; + struct nfs_fh fh; + struct nfs_fattr fattr; + int err; + + BUG_ON(IS_ROOT(dentry)); + dprintk("%s: enter\n", __FUNCTION__); + dput(nd->dentry); + nd->dentry = dget(dentry); + if (d_mountpoint(nd->dentry)) + goto out_follow; + /* Look it up again */ + parent = dget_parent(nd->dentry); + err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr); + dput(parent); + if (err != 0) + goto out_err; + + if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) + mnt = nfs_do_refmount(nd->mnt, nd->dentry); + else + mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) + goto out_err; + + mntget(mnt); + err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list); + if (err < 0) { + mntput(mnt); + if (err == -EBUSY) + goto out_follow; + goto out_err; + } + mntput(nd->mnt); + dput(nd->dentry); + nd->mnt = mnt; + nd->dentry = dget(mnt->mnt_root); + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +out: + dprintk("%s: done, returned %d\n", __FUNCTION__, err); + return ERR_PTR(err); +out_err: + path_release(nd); + goto out; +out_follow: + while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) + ; + err = 0; + goto out; +} + +struct inode_operations nfs_mountpoint_inode_operations = { + .follow_link = nfs_follow_mountpoint, + .getattr = nfs_getattr, +}; + +struct inode_operations nfs_referral_inode_operations = { + .follow_link = nfs_follow_mountpoint, +}; + +static void nfs_expire_automounts(void *data) +{ + struct list_head *list = (struct list_head *)data; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); +} + +void nfs_release_automount_timer(void) +{ + if (list_empty(&nfs_automount_list)) { + cancel_delayed_work(&nfs_automount_task); + flush_scheduled_work(); + } +} + +/* + * Clone a mountpoint of the appropriate type + */ +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, + struct nfs_clone_mount *mountdata) +{ +#ifdef CONFIG_NFS_V4 + struct vfsmount *mnt = NULL; + switch (server->rpc_ops->version) { + case 2: + case 3: + mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); + break; + case 4: + mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata); + } + return mnt; +#else + return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); +#endif +} + +/** + * nfs_do_submount - set up mountpoint when crossing a filesystem boundary + * @mnt_parent - mountpoint of parent directory + * @dentry - parent directory + * @fh - filehandle for new root dentry + * @fattr - attributes for new root inode + * + */ +struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, + const struct dentry *dentry, struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_clone_mount mountdata = { + .sb = mnt_parent->mnt_sb, + .dentry = dentry, + .fh = fh, + .fattr = fattr, + }; + struct vfsmount *mnt = ERR_PTR(-ENOMEM); + char *page = (char *) __get_free_page(GFP_USER); + char *devname; + + dprintk("%s: submounting on %s/%s\n", __FUNCTION__, + dentry->d_parent->d_name.name, + dentry->d_name.name); + if (page == NULL) + goto out; + devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); + mnt = (struct vfsmount *)devname; + if (IS_ERR(devname)) + goto free_page; + mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); +free_page: + free_page((unsigned long)page); +out: + dprintk("%s: done\n", __FUNCTION__); + return mnt; +} diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index f0015fa876e1..67391eef6b93 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -23,12 +23,11 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR /* #define NFS_PARANOIA 1 */ -extern int nfs_stat_to_errno(int stat); - /* Mapping from NFS error code to "errno" error code. */ #define errno_NFSERR_IO EIO @@ -131,7 +130,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) fattr->du.nfs2.blocksize = ntohl(*p++); rdev = ntohl(*p++); fattr->du.nfs2.blocks = ntohl(*p++); - fattr->fsid_u.nfs3 = ntohl(*p++); + fattr->fsid.major = ntohl(*p++); + fattr->fsid.minor = 0; fattr->fileid = ntohl(*p++); p = xdr_decode_time(p, &fattr->atime); p = xdr_decode_time(p, &fattr->mtime); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 33287879bd23..7322da4d2055 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, inode->i_ino, acl, dfacl); spin_lock(&inode->i_lock); __nfs3_forget_cached_acls(NFS_I(inode)); - nfsi->acl_access = posix_acl_dup(acl); - nfsi->acl_default = posix_acl_dup(dfacl); + if (!IS_ERR(acl)) + nfsi->acl_access = posix_acl_dup(acl); + if (!IS_ERR(dfacl)) + nfsi->acl_default = posix_acl_dup(dfacl); spin_unlock(&inode->i_lock); } @@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) res.acl_access = NULL; } } - nfs3_cache_acls(inode, res.acl_access, res.acl_default); + nfs3_cache_acls(inode, + (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), + (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); switch(type) { case ACL_TYPE_ACCESS: @@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); + nfs3_cache_acls(inode, acl, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index cf186f0d2b3b..7143b1f82cea 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -20,11 +20,10 @@ #include <linux/nfs_mount.h> #include "iostat.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC -extern struct rpc_procinfo nfs3_procedures[]; - /* A wrapper to handle the EJUKEBOX error message */ static int nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) @@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); - static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) { if (nfs3_async_handle_jukebox(task, data->inode)) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index ec233619687e..0250269e9753 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -22,14 +22,13 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR /* Mapping from NFS error code to "errno" error code. */ #define errno_NFSERR_IO EIO -extern int nfs_stat_to_errno(int); - /* * Declare the space requirements for NFS arguments and replies as * number of 32bit-words @@ -166,7 +165,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor) fattr->rdev = 0; - p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3); + p = xdr_decode_hyper(p, &fattr->fsid.major); + fattr->fsid.minor = 0; p = xdr_decode_hyper(p, &fattr->fileid); p = xdr_decode_time3(p, &fattr->atime); p = xdr_decode_time3(p, &fattr->mtime); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 0f5e4e7cddec..9a102860df37 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -217,6 +217,9 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); +extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); +extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, + struct nfs4_fs_locations *fs_locations, struct page *page); extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; @@ -225,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fs_locations_bitmap[2]; /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs4_client *); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c new file mode 100644 index 000000000000..ea38d27b74e6 --- /dev/null +++ b/fs/nfs/nfs4namespace.c @@ -0,0 +1,201 @@ +/* + * linux/fs/nfs/nfs4namespace.c + * + * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * NFSv4 namespace + */ + +#include <linux/config.h> + +#include <linux/dcache.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/nfs_fs.h> +#include <linux/string.h> +#include <linux/sunrpc/clnt.h> +#include <linux/vfs.h> +#include <linux/inet.h> +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +/* + * Check if fs_root is valid + */ +static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, + char *buffer, ssize_t buflen) +{ + char *end = buffer + buflen; + int n; + + *--end = '\0'; + buflen--; + + n = pathname->ncomponents; + while (--n >= 0) { + struct nfs4_string *component = &pathname->components[n]; + buflen -= component->len + 1; + if (buflen < 0) + goto Elong; + end -= component->len; + memcpy(end, component->data, component->len); + *--end = '/'; + } + return end; +Elong: + return ERR_PTR(-ENAMETOOLONG); +} + + +/** + * nfs_follow_referral - set up mountpoint when hitting a referral on moved error + * @mnt_parent - mountpoint of parent directory + * @dentry - parent directory + * @fspath - fs path returned in fs_locations + * @mntpath - mount path to new server + * @hostname - hostname of new server + * @addr - host addr of new server + * + */ +static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + struct nfs4_fs_locations *locations) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct nfs_clone_mount mountdata = { + .sb = mnt_parent->mnt_sb, + .dentry = dentry, + .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, + }; + char *page, *page2; + char *path, *fs_path; + char *devname; + int loc, s; + + if (locations == NULL || locations->nlocations <= 0) + goto out; + + dprintk("%s: referral at %s/%s\n", __FUNCTION__, + dentry->d_parent->d_name.name, dentry->d_name.name); + + /* Ensure fs path is a prefix of current dentry path */ + page = (char *) __get_free_page(GFP_USER); + if (page == NULL) + goto out; + page2 = (char *) __get_free_page(GFP_USER); + if (page2 == NULL) + goto out; + + path = nfs4_path(dentry, page, PAGE_SIZE); + if (IS_ERR(path)) + goto out_free; + + fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); + if (IS_ERR(fs_path)) + goto out_free; + + if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path); + goto out_free; + } + + devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); + if (IS_ERR(devname)) { + mnt = (struct vfsmount *)devname; + goto out_free; + } + + loc = 0; + while (loc < locations->nlocations && IS_ERR(mnt)) { + struct nfs4_fs_location *location = &locations->locations[loc]; + char *mnt_path; + + if (location == NULL || location->nservers <= 0 || + location->rootpath.ncomponents == 0) { + loc++; + continue; + } + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) { + loc++; + continue; + } + mountdata.mnt_path = mnt_path; + + s = 0; + while (s < location->nservers) { + struct sockaddr_in addr = {}; + + if (location->servers[s].len <= 0 || + valid_ipaddr4(location->servers[s].data) < 0) { + s++; + continue; + } + + mountdata.hostname = location->servers[s].data; + addr.sin_addr.s_addr = in_aton(mountdata.hostname); + addr.sin_family = AF_INET; + addr.sin_port = htons(NFS_PORT); + mountdata.addr = &addr; + + mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata); + if (!IS_ERR(mnt)) { + break; + } + s++; + } + loc++; + } + +out_free: + free_page((unsigned long)page); + free_page((unsigned long)page2); +out: + dprintk("%s: done\n", __FUNCTION__); + return mnt; +} + +/* + * nfs_do_refmount - handle crossing a referral on server + * @dentry - dentry of referral + * @nd - nameidata info + * + */ +struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) +{ + struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct dentry *parent; + struct nfs4_fs_locations *fs_locations = NULL; + struct page *page; + int err; + + /* BUG_ON(IS_ROOT(dentry)); */ + dprintk("%s: enter\n", __FUNCTION__); + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + + fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (fs_locations == NULL) + goto out_free; + + /* Get locations */ + parent = dget_parent(dentry); + dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name); + err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page); + dput(parent); + if (err != 0 || fs_locations->nlocations <= 0 || + fs_locations->fs_path.ncomponents <= 0) + goto out_free; + + mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); +out_free: + __free_page(page); + kfree(fs_locations); +out: + dprintk("%s: done\n", __FUNCTION__); + return mnt; +} diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d86c0db7b1e8..b4916b092194 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *) static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); -extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); -extern struct rpc_procinfo nfs4_procedures[]; /* Prevent leaks of NFSv4 errors into userland */ int nfs4_map_errors(int err) @@ -121,6 +119,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE 0 }; +const u32 nfs4_fs_locations_bitmap[2] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID + | FATTR4_WORD0_FS_LOCATIONS, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID +}; + static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, struct nfs4_readdir_arg *readdir) { @@ -185,15 +202,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp spin_unlock(&clp->cl_lock); } -static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo) +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { - struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_inode *nfsi = NFS_I(dir); - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + spin_lock(&dir->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; if (cinfo->before == nfsi->change_attr && cinfo->atomic) nfsi->change_attr = cinfo->after; - spin_unlock(&inode->i_lock); + spin_unlock(&dir->i_lock); } struct nfs4_opendata { @@ -1331,7 +1348,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f return status; } -static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) +int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { struct nfs4_exception exception = { }; int err; @@ -1443,6 +1460,50 @@ out: return nfs4_map_errors(status); } +/* + * Get locations and (maybe) other attributes of a referral. + * Note that we'll actually follow the referral later when + * we detect fsid mismatch in inode revalidation + */ +static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +{ + int status = -ENOMEM; + struct page *page = NULL; + struct nfs4_fs_locations *locations = NULL; + struct dentry dentry = {}; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + goto out; + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (locations == NULL) + goto out; + + dentry.d_name.name = name->name; + dentry.d_name.len = name->len; + status = nfs4_proc_fs_locations(dir, &dentry, locations, page); + if (status != 0) + goto out; + /* Make sure server returned a different fsid for the referral */ + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name); + status = -EIO; + goto out; + } + + memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; + if (!fattr->mode) + fattr->mode = S_IFDIR; + memset(fhandle, 0, sizeof(struct nfs_fh)); +out: + if (page) + __free_page(page); + if (locations) + kfree(locations); + return status; +} + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_getattr_arg args = { @@ -1547,6 +1608,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, dprintk("NFS call lookup %s\n", name->name); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (status == -NFS4ERR_MOVED) + status = nfs4_get_referral(dir, name, fattr, fhandle); dprintk("NFS reply lookup: %d\n", status); return status; } @@ -2008,7 +2071,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); - nfs_refresh_inode(inode, res.fattr); + nfs_post_op_update_inode(inode, res.fattr); } return status; @@ -3570,6 +3633,36 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) return len; } +int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, + struct nfs4_fs_locations *fs_locations, struct page *page) +{ + struct nfs_server *server = NFS_SERVER(dir); + u32 bitmask[2] = { + [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, + [1] = FATTR4_WORD1_MOUNTED_ON_FILEID, + }; + struct nfs4_fs_locations_arg args = { + .dir_fh = NFS_FH(dir), + .name = &dentry->d_name, + .page = page, + .bitmask = bitmask, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS], + .rpc_argp = &args, + .rpc_resp = fs_locations, + }; + int status; + + dprintk("%s: start\n", __FUNCTION__); + fs_locations->fattr.valid = 0; + fs_locations->server = server; + fs_locations->nlocations = 0; + status = rpc_call_sync(server->client, &msg, 0); + dprintk("%s: returned status = %d\n", __FUNCTION__, status); + return status; +} + struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7c5d70efe720..1750d996f49f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int); #define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define NFS4_enc_fs_locations_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_fs_locations_sz \ + (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz) static struct { unsigned int mode; @@ -722,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) bitmask[1] & nfs4_fsinfo_bitmap[1]); } +static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) +{ + return encode_getattr_two(xdr, + bitmask[0] & nfs4_fs_locations_bitmap[0], + bitmask[1] & nfs4_fs_locations_bitmap[1]); +} + static int encode_getfh(struct xdr_stream *xdr) { uint32_t *p; @@ -2003,6 +2019,38 @@ out: } /* + * Encode FS_LOCATIONS request + */ +static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 3, + }; + struct rpc_auth *auth = req->rq_task->tk_auth; + int replen; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) + goto out; + if ((status = encode_lookup(&xdr, args->name)) != 0) + goto out; + if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0) + goto out; + /* set up reply + * toplevel_status + OP_PUTFH + status + * + OP_LOOKUP + status + OP_GETATTR + status = 7 + */ + replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; + xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, + 0, PAGE_SIZE); +out: + return status; +} + +/* * START OF "GENERIC" DECODE ROUTINES. * These may look a little ugly since they are imported from a "generic" * set of XDR encode/decode routines which are intended to be shared by @@ -2036,7 +2084,7 @@ out: } \ } while (0) -static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string) +static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { uint32_t *p; @@ -2087,7 +2135,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) { uint32_t *p; - uint32_t strlen; + unsigned int strlen; char *str; READ_BUF(12); @@ -2217,7 +2265,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, return 0; } -static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid) +static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) { uint32_t *p; @@ -2285,6 +2333,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t return 0; } +static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) +{ + uint32_t *p; + + *fileid = 0; + if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) + return -EIO; + if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { + READ_BUF(8); + READ64(*fileid); + bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + } + dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid); + return 0; +} + static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) { uint32_t *p; @@ -2336,6 +2400,116 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin return status; } +static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) +{ + int n; + uint32_t *p; + int status = 0; + + READ_BUF(4); + READ32(n); + if (n < 0) + goto out_eio; + if (n == 0) + goto root_path; + dprintk("path "); + path->ncomponents = 0; + while (path->ncomponents < n) { + struct nfs4_string *component = &path->components[path->ncomponents]; + status = decode_opaque_inline(xdr, &component->len, &component->data); + if (unlikely(status != 0)) + goto out_eio; + if (path->ncomponents != n) + dprintk("/"); + dprintk("%s", component->data); + if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS) + path->ncomponents++; + else { + dprintk("cannot parse %d components in path\n", n); + goto out_eio; + } + } +out: + dprintk("\n"); + return status; +root_path: +/* a root pathname is sent as a zero component4 */ + path->ncomponents = 1; + path->components[0].len=0; + path->components[0].data=NULL; + dprintk("path /\n"); + goto out; +out_eio: + dprintk(" status %d", status); + status = -EIO; + goto out; +} + +static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) +{ + int n; + uint32_t *p; + int status = -EIO; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) + goto out; + status = 0; + if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS))) + goto out; + dprintk("%s: fsroot ", __FUNCTION__); + status = decode_pathname(xdr, &res->fs_path); + if (unlikely(status != 0)) + goto out; + READ_BUF(4); + READ32(n); + if (n <= 0) + goto out_eio; + res->nlocations = 0; + while (res->nlocations < n) { + int m; + struct nfs4_fs_location *loc = &res->locations[res->nlocations]; + + READ_BUF(4); + READ32(m); + if (m <= 0) + goto out_eio; + + loc->nservers = 0; + dprintk("%s: servers ", __FUNCTION__); + while (loc->nservers < m) { + struct nfs4_string *server = &loc->servers[loc->nservers]; + status = decode_opaque_inline(xdr, &server->len, &server->data); + if (unlikely(status != 0)) + goto out_eio; + dprintk("%s ", server->data); + if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS) + loc->nservers++; + else { + int i; + dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations); + for (i = loc->nservers; i < m; i++) { + int len; + char *data; + status = decode_opaque_inline(xdr, &len, &data); + if (unlikely(status != 0)) + goto out_eio; + } + } + } + status = decode_pathname(xdr, &loc->rootpath); + if (unlikely(status != 0)) + goto out_eio; + if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES) + res->nlocations++; + } +out: + dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status); + return status; +out_eio: + status = -EIO; + goto out; +} + static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) { uint32_t *p; @@ -2841,6 +3015,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons bitmap[2] = {0}, type; int status, fmode = 0; + uint64_t fileid; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) goto xdr_error; @@ -2863,10 +3038,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0) goto xdr_error; - if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0) + if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0) goto xdr_error; if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0) goto xdr_error; + if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr, + struct nfs4_fs_locations, + fattr))) != 0) + goto xdr_error; if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0) goto xdr_error; fattr->mode |= fmode; @@ -2886,6 +3065,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0) goto xdr_error; + if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0) + goto xdr_error; + if (fattr->fileid == 0 && fileid != 0) + fattr->fileid = fileid; if ((status = verify_attr_len(xdr, savep, attrlen)) == 0) fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4; xdr_error: @@ -3350,8 +3533,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, attrlen, recvd); return -EINVAL; } - if (attrlen <= *acl_len) - xdr_read_pages(xdr, attrlen); + xdr_read_pages(xdr, attrlen); *acl_len = attrlen; } else status = -EOPNOTSUPP; @@ -4211,6 +4393,29 @@ out: return status; } +/* + * FS_LOCATIONS request + */ +static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &req->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + if ((status = decode_putfh(&xdr)) != 0) + goto out; + if ((status = decode_lookup(&xdr)) != 0) + goto out; + xdr_enter_page(&xdr, PAGE_SIZE); + status = decode_getfattr(&xdr, &res->fattr, res->server); +out: + return status; +} + uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) { uint32_t bitmap[2] = {0}; @@ -4382,6 +4587,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), }; struct rpc_version nfs_version4 = { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 106aca388ebc..d89f6fb3b3a3 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -325,6 +325,7 @@ out: /** * nfs_scan_list - Scan a list for matching requests + * @nfsi: NFS inode * @head: One of the NFS inode request lists * @dst: Destination list * @idx_start: lower bound of page->index to scan @@ -336,14 +337,15 @@ out: * The requests are *not* checked to ensure that they form a contiguous set. * You must be holding the inode's req_lock when calling this function */ -int -nfs_scan_list(struct list_head *head, struct list_head *dst, - unsigned long idx_start, unsigned int npages) +int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head, + struct list_head *dst, unsigned long idx_start, + unsigned int npages) { - struct list_head *pos, *tmp; - struct nfs_page *req; - unsigned long idx_end; - int res; + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + unsigned long idx_end; + int found, i; + int res; res = 0; if (npages == 0) @@ -351,25 +353,32 @@ nfs_scan_list(struct list_head *head, struct list_head *dst, else idx_end = idx_start + npages - 1; - list_for_each_safe(pos, tmp, head) { - - req = nfs_list_entry(pos); - - if (req->wb_index < idx_start) - continue; - if (req->wb_index > idx_end) + for (;;) { + found = radix_tree_gang_lookup(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, + NFS_SCAN_MAXENTRIES); + if (found <= 0) break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + idx_start = req->wb_index + 1; + if (req->wb_list_head != head) + continue; + if (nfs_set_page_writeback_locked(req)) { + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + } - if (!nfs_set_page_writeback_locked(req)) - continue; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - res++; } +out: return res; } -int nfs_init_nfspagecache(void) +int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", sizeof(struct nfs_page), diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 9dd85cac2df0..b3899ea3229e 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -44,11 +44,10 @@ #include <linux/nfs_page.h> #include <linux/lockd/bind.h> #include <linux/smp_lock.h> +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_PROC -extern struct rpc_procinfo nfs_procedures[]; - /* * Bare-bones access to getattr: this is for nfs_read_super. */ @@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); - static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { if (task->tk_status >= 0) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 624ca7146b6b..32cf3773af0c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - if (pagecount < NFS_PAGEVEC_SIZE) - p->pagevec = &p->page_array[0]; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; else { - size_t size = ++pagecount * sizeof(struct page *); - p->pagevec = kmalloc(size, GFP_NOFS); - if (p->pagevec) { - memset(p->pagevec, 0, size); - } else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + if (!p->pagevec) { mempool_free(p, nfs_rdata_mempool); p = NULL; } @@ -104,6 +101,28 @@ int nfs_return_empty_page(struct page *page) return 0; } +static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) +{ + unsigned int remainder = data->args.count - data->res.count; + unsigned int base = data->args.pgbase + data->res.count; + unsigned int pglen; + struct page **pages; + + if (data->res.eof == 0 || remainder == 0) + return; + /* + * Note: "remainder" can never be negative, since we check for + * this in the XDR code. + */ + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + pglen = PAGE_CACHE_SIZE - base; + if (pglen < remainder) + memclear_highpage_flush(*pages, base, pglen); + else + memclear_highpage_flush(*pages, base, remainder); +} + /* * Read a page synchronously. */ @@ -177,11 +196,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; spin_unlock(&inode->i_lock); - if (count) - memclear_highpage_flush(page, rdata->args.pgbase, count); - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); + nfs_readpage_truncate_uninitialised_page(rdata); + if (rdata->res.eof || rdata->res.count == rdata->args.count) + SetPageUptodate(page); result = 0; io_error: @@ -436,20 +453,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) struct nfs_page *req = data->req; struct page *page = req->wb_page; + if (likely(task->tk_status >= 0)) + nfs_readpage_truncate_uninitialised_page(data); + else + SetPageError(page); if (nfs_readpage_result(task, data) != 0) return; - if (task->tk_status >= 0) { - unsigned int request = data->args.count; - unsigned int result = data->res.count; - - if (result < request) { - memclear_highpage_flush(page, - data->args.pgbase + result, - request - result); - } - } else - SetPageError(page); - if (atomic_dec_and_test(&req->wb_complete)) { if (!PageError(page)) SetPageUptodate(page); @@ -462,6 +471,40 @@ static const struct rpc_call_ops nfs_read_partial_ops = { .rpc_release = nfs_readdata_release, }; +static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) +{ + unsigned int count = data->res.count; + unsigned int base = data->args.pgbase; + struct page **pages; + + if (unlikely(count == 0)) + return; + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + count += base; + for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) + SetPageUptodate(*pages); + /* + * Was this an eof or a short read? If the latter, don't mark the page + * as uptodate yet. + */ + if (count > 0 && (data->res.eof || data->args.count == data->res.count)) + SetPageUptodate(*pages); +} + +static void nfs_readpage_set_pages_error(struct nfs_read_data *data) +{ + unsigned int count = data->args.count; + unsigned int base = data->args.pgbase; + struct page **pages; + + pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; + base &= ~PAGE_CACHE_MASK; + count += base; + for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) + SetPageError(*pages); +} + /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). @@ -469,27 +512,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = { static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - unsigned int count = data->res.count; + /* + * Note: nfs_readpage_result may change the values of + * data->args. In the multi-page case, we therefore need + * to ensure that we call the next nfs_readpage_set_page_uptodate() + * first in the multi-page case. + */ + if (likely(task->tk_status >= 0)) { + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_set_pages_uptodate(data); + } else + nfs_readpage_set_pages_error(data); if (nfs_readpage_result(task, data) != 0) return; while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); - struct page *page = req->wb_page; - nfs_list_remove_request(req); - if (task->tk_status >= 0) { - if (count < PAGE_CACHE_SIZE) { - if (count < req->wb_bytes) - memclear_highpage_flush(page, - req->wb_pgbase + count, - req->wb_bytes - count); - count = 0; - } else - count -= PAGE_CACHE_SIZE; - SetPageUptodate(page); - } else - SetPageError(page); + nfs_list_remove_request(req); nfs_readpage_release(req); } } @@ -654,7 +694,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, return ret; } -int nfs_init_readpagecache(void) +int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", sizeof(struct nfs_read_data), diff --git a/fs/nfs/super.c b/fs/nfs/super.c new file mode 100644 index 000000000000..e8a9bee74d9d --- /dev/null +++ b/fs/nfs/super.c @@ -0,0 +1,1537 @@ +/* + * linux/fs/nfs/super.c + * + * Copyright (C) 1992 Rick Sladkey + * + * nfs superblock handling functions + * + * Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some + * experimental NFS changes. Modularisation taken straight from SYS5 fs. + * + * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. + * J.S.Peatfield@damtp.cam.ac.uk + * + * Split from inode.c by David Howells <dhowells@redhat.com> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/metrics.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/smp_lock.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/inet.h> +#include <linux/nfs_xdr.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + +/* + * RPC cruft for NFS + */ +static struct rpc_version * nfs_version[] = { + NULL, + NULL, + &nfs_version2, +#if defined(CONFIG_NFS_V3) + &nfs_version3, +#elif defined(CONFIG_NFS_V4) + NULL, +#endif +#if defined(CONFIG_NFS_V4) + &nfs_version4, +#endif +}; + +static struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = "/nfs", +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + +static void nfs_umount_begin(struct vfsmount *, int); +static int nfs_statfs(struct dentry *, struct kstatfs *); +static int nfs_show_options(struct seq_file *, struct vfsmount *); +static int nfs_show_stats(struct seq_file *, struct vfsmount *); +static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); +static int nfs_clone_nfs_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static void nfs_kill_super(struct super_block *); + +static struct file_system_type nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .get_sb = nfs_get_sb, + .kill_sb = nfs_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type clone_nfs_fs_type = { + .owner = THIS_MODULE, + .name = "nfs", + .get_sb = nfs_clone_nfs_sb, + .kill_sb = nfs_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct super_operations nfs_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs_write_inode, + .statfs = nfs_statfs, + .clear_inode = nfs_clear_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_stats = nfs_show_stats, +}; + +#ifdef CONFIG_NFS_V4 +static int nfs4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static void nfs4_kill_super(struct super_block *sb); + +static struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .get_sb = nfs4_get_sb, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type clone_nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .get_sb = nfs_clone_nfs4_sb, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +struct file_system_type nfs_referral_nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .get_sb = nfs_referral_nfs4_sb, + .kill_sb = nfs4_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, +}; + +static struct super_operations nfs4_sops = { + .alloc_inode = nfs_alloc_inode, + .destroy_inode = nfs_destroy_inode, + .write_inode = nfs_write_inode, + .statfs = nfs_statfs, + .clear_inode = nfs4_clear_inode, + .umount_begin = nfs_umount_begin, + .show_options = nfs_show_options, + .show_stats = nfs_show_stats, +}; +#endif + +#ifdef CONFIG_NFS_V4 +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; + +static int param_set_port(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + return -EINVAL; + *((int *)kp->arg) = num; + return 0; +} + +module_param_call(callback_tcpport, param_set_port, param_get_int, + &nfs_callback_set_tcpport, 0644); +#endif + +#ifdef CONFIG_NFS_V4 +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); +#endif + +/* + * Register the NFS filesystems + */ +int __init register_nfs_fs(void) +{ + int ret; + + ret = register_filesystem(&nfs_fs_type); + if (ret < 0) + goto error_0; + +#ifdef CONFIG_NFS_V4 + ret = nfs_register_sysctl(); + if (ret < 0) + goto error_1; + ret = register_filesystem(&nfs4_fs_type); + if (ret < 0) + goto error_2; +#endif + return 0; + +#ifdef CONFIG_NFS_V4 +error_2: + nfs_unregister_sysctl(); +error_1: + unregister_filesystem(&nfs_fs_type); +#endif +error_0: + return ret; +} + +/* + * Unregister the NFS filesystems + */ +void __exit unregister_nfs_fs(void) +{ +#ifdef CONFIG_NFS_V4 + unregister_filesystem(&nfs4_fs_type); + nfs_unregister_sysctl(); +#endif + unregister_filesystem(&nfs_fs_type); +} + +/* + * Deliver file system statistics to userspace + */ +static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nfs_server *server = NFS_SB(sb); + unsigned char blockbits; + unsigned long blockres; + struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode); + struct nfs_fattr fattr; + struct nfs_fsstat res = { + .fattr = &fattr, + }; + int error; + + lock_kernel(); + + error = server->rpc_ops->statfs(server, rootfh, &res); + buf->f_type = NFS_SUPER_MAGIC; + if (error < 0) + goto out_err; + + /* + * Current versions of glibc do not correctly handle the + * case where f_frsize != f_bsize. Eventually we want to + * report the value of wtmult in this field. + */ + buf->f_frsize = sb->s_blocksize; + + /* + * On most *nix systems, f_blocks, f_bfree, and f_bavail + * are reported in units of f_frsize. Linux hasn't had + * an f_frsize field in its statfs struct until recently, + * thus historically Linux's sys_statfs reports these + * fields in units of f_bsize. + */ + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; + blockres = (1 << blockbits) - 1; + buf->f_blocks = (res.tbytes + blockres) >> blockbits; + buf->f_bfree = (res.fbytes + blockres) >> blockbits; + buf->f_bavail = (res.abytes + blockres) >> blockbits; + + buf->f_files = res.tfiles; + buf->f_ffree = res.afiles; + + buf->f_namelen = server->namelen; + out: + unlock_kernel(); + return 0; + + out_err: + dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); + buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; + goto out; + +} + +static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) +{ + static struct { + rpc_authflavor_t flavour; + const char *str; + } sec_flavours[] = { + { RPC_AUTH_NULL, "null" }, + { RPC_AUTH_UNIX, "sys" }, + { RPC_AUTH_GSS_KRB5, "krb5" }, + { RPC_AUTH_GSS_KRB5I, "krb5i" }, + { RPC_AUTH_GSS_KRB5P, "krb5p" }, + { RPC_AUTH_GSS_LKEY, "lkey" }, + { RPC_AUTH_GSS_LKEYI, "lkeyi" }, + { RPC_AUTH_GSS_LKEYP, "lkeyp" }, + { RPC_AUTH_GSS_SPKM, "spkm" }, + { RPC_AUTH_GSS_SPKMI, "spkmi" }, + { RPC_AUTH_GSS_SPKMP, "spkmp" }, + { -1, "unknown" } + }; + int i; + + for (i=0; sec_flavours[i].flavour != -1; i++) { + if (sec_flavours[i].flavour == flavour) + break; + } + return sec_flavours[i].str; +} + +/* + * Describe the mount options in force on this server representation + */ +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) +{ + static struct proc_nfs_info { + int flag; + char *str; + char *nostr; + } nfs_info[] = { + { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_INTR, ",intr", "" }, + { NFS_MOUNT_NOCTO, ",nocto", "" }, + { NFS_MOUNT_NOAC, ",noac", "" }, + { NFS_MOUNT_NONLM, ",nolock", "" }, + { NFS_MOUNT_NOACL, ",noacl", "" }, + { 0, NULL, NULL } + }; + struct proc_nfs_info *nfs_infop; + char buf[12]; + char *proto; + + seq_printf(m, ",vers=%d", nfss->rpc_ops->version); + seq_printf(m, ",rsize=%d", nfss->rsize); + seq_printf(m, ",wsize=%d", nfss->wsize); + if (nfss->acregmin != 3*HZ || showdefaults) + seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); + if (nfss->acregmax != 60*HZ || showdefaults) + seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); + if (nfss->acdirmin != 30*HZ || showdefaults) + seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); + if (nfss->acdirmax != 60*HZ || showdefaults) + seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); + for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { + if (nfss->flags & nfs_infop->flag) + seq_puts(m, nfs_infop->str); + else + seq_puts(m, nfs_infop->nostr); + } + switch (nfss->client->cl_xprt->prot) { + case IPPROTO_TCP: + proto = "tcp"; + break; + case IPPROTO_UDP: + proto = "udp"; + break; + default: + snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot); + proto = buf; + } + seq_printf(m, ",proto=%s", proto); + seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); + seq_printf(m, ",retrans=%u", nfss->retrans_count); + seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); +} + +/* + * Describe the mount options on this VFS mountpoint + */ +static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + + nfs_show_mount_options(m, nfss, 0); + + seq_puts(m, ",addr="); + seq_escape(m, nfss->hostname, " \t\n\\"); + + return 0; +} + +/* + * Present statistical information for this VFS mountpoint + */ +static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) +{ + int i, cpu; + struct nfs_server *nfss = NFS_SB(mnt->mnt_sb); + struct rpc_auth *auth = nfss->client->cl_auth; + struct nfs_iostats totals = { }; + + seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS); + + /* + * Display all mount option settings + */ + seq_printf(m, "\n\topts:\t"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw"); + seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : ""); + seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + nfs_show_mount_options(m, nfss, 1); + + seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); + + seq_printf(m, "\n\tcaps:\t"); + seq_printf(m, "caps=0x%x", nfss->caps); + seq_printf(m, ",wtmult=%d", nfss->wtmult); + seq_printf(m, ",dtsize=%d", nfss->dtsize); + seq_printf(m, ",bsize=%d", nfss->bsize); + seq_printf(m, ",namelen=%d", nfss->namelen); + +#ifdef CONFIG_NFS_V4 + if (nfss->rpc_ops->version == 4) { + seq_printf(m, "\n\tnfsv4:\t"); + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + } +#endif + + /* + * Display security flavor in effect for this mount + */ + seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + if (auth->au_flavor) + seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + + /* + * Display superblock I/O counters + */ + for_each_possible_cpu(cpu) { + struct nfs_iostats *stats; + + preempt_disable(); + stats = per_cpu_ptr(nfss->io_stats, cpu); + + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + totals.events[i] += stats->events[i]; + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + totals.bytes[i] += stats->bytes[i]; + + preempt_enable(); + } + + seq_printf(m, "\n\tevents:\t"); + for (i = 0; i < __NFSIOS_COUNTSMAX; i++) + seq_printf(m, "%lu ", totals.events[i]); + seq_printf(m, "\n\tbytes:\t"); + for (i = 0; i < __NFSIOS_BYTESMAX; i++) + seq_printf(m, "%Lu ", totals.bytes[i]); + seq_printf(m, "\n"); + + rpc_print_iostats(m, nfss->client); + + return 0; +} + +/* + * Begin unmount by attempting to remove all automounted mountpoints we added + * in response to traversals + */ +static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) +{ + struct nfs_server *server; + struct rpc_clnt *rpc; + + shrink_submounts(vfsmnt, &nfs_automount_list); + if (!(flags & MNT_FORCE)) + return; + /* -EIO all pending I/O */ + server = NFS_SB(vfsmnt->mnt_sb); + rpc = server->client; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = server->client_acl; + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); +} + +/* + * Obtain the root inode of the file system. + */ +static struct inode * +nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo) +{ + struct nfs_server *server = NFS_SB(sb); + int error; + + error = server->rpc_ops->getroot(server, rootfh, fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + return ERR_PTR(error); + } + + server->fsid = fsinfo->fattr->fsid; + return nfs_fhget(sb, rootfh, fsinfo->fattr); +} + +/* + * Do NFS version-independent mount processing, and sanity checking + */ +static int +nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) +{ + struct nfs_server *server; + struct inode *root_inode; + struct nfs_fattr fattr; + struct nfs_fsinfo fsinfo = { + .fattr = &fattr, + }; + struct nfs_pathconf pathinfo = { + .fattr = &fattr, + }; + int no_root_error = 0; + unsigned long max_rpc_payload; + + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + + server = NFS_SB(sb); + + sb->s_magic = NFS_SUPER_MAGIC; + + server->io_stats = nfs_alloc_iostats(); + if (server->io_stats == NULL) + return -ENOMEM; + + root_inode = nfs_get_root(sb, &server->fh, &fsinfo); + /* Did getting the root inode fail? */ + if (IS_ERR(root_inode)) { + no_root_error = PTR_ERR(root_inode); + goto out_no_root; + } + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) { + no_root_error = -ENOMEM; + goto out_no_root; + } + sb->s_root->d_op = server->rpc_ops->dentry_ops; + + /* mount time stamp, in seconds */ + server->mount_time = jiffies; + + /* Get some general file system info */ + if (server->namelen == 0 && + server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo.rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo.wtpref, NULL); + + if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax) + server->rsize = nfs_block_size(fsinfo.rtmax, NULL); + if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax) + server->wsize = nfs_block_size(fsinfo.wtmax, NULL); + + max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); + server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE) + server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + sb->s_flags |= MS_SYNCHRONOUS; + } + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); + + server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0; + server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); + return 0; + /* Yargs. It didn't work out. */ +out_no_root: + dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error); + if (!IS_ERR(root_inode)) + iput(root_inode); + return no_root_error; +} + +/* + * Initialise the timeout values for a connection + */ +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + if (!to->to_retries) + to->to_retries = 2; + + switch (proto) { + case IPPROTO_TCP: + if (!to->to_initval) + to->to_initval = 60 * HZ; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + to->to_exponential = 0; + break; + case IPPROTO_UDP: + default: + if (!to->to_initval) + to->to_initval = 11 * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + } +} + +/* + * Create an RPC client handle. + */ +static struct rpc_clnt * +nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) +{ + struct rpc_timeout timeparms; + struct rpc_xprt *xprt = NULL; + struct rpc_clnt *clnt = NULL; + int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; + + nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans); + + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + + /* create transport and client */ + xprt = xprt_create_proto(proto, &server->addr, &timeparms); + if (IS_ERR(xprt)) { + dprintk("%s: cannot create RPC transport. Error = %ld\n", + __FUNCTION__, PTR_ERR(xprt)); + return (struct rpc_clnt *)xprt; + } + clnt = rpc_create_client(xprt, server->hostname, &nfs_program, + server->rpc_ops->version, data->pseudoflavor); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. Error = %ld\n", + __FUNCTION__, PTR_ERR(xprt)); + goto out_fail; + } + + clnt->cl_intr = 1; + clnt->cl_softrtry = 1; + + return clnt; + +out_fail: + return clnt; +} + +/* + * Clone a server record + */ +static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_server *parent = NFS_SB(data->sb); + struct inode *root_inode; + struct nfs_fsinfo fsinfo; + void *err = ERR_PTR(-ENOMEM); + + sb->s_op = data->sb->s_op; + sb->s_blocksize = data->sb->s_blocksize; + sb->s_blocksize_bits = data->sb->s_blocksize_bits; + sb->s_maxbytes = data->sb->s_maxbytes; + + server->client_sys = server->client_acl = ERR_PTR(-EINVAL); + server->io_stats = nfs_alloc_iostats(); + if (server->io_stats == NULL) + goto out; + + server->client = rpc_clone_client(parent->client); + if (IS_ERR((err = server->client))) + goto out; + + if (!IS_ERR(parent->client_sys)) { + server->client_sys = rpc_clone_client(parent->client_sys); + if (IS_ERR((err = server->client_sys))) + goto out; + } + if (!IS_ERR(parent->client_acl)) { + server->client_acl = rpc_clone_client(parent->client_acl); + if (IS_ERR((err = server->client_acl))) + goto out; + } + root_inode = nfs_fhget(sb, data->fh, data->fattr); + if (!root_inode) + goto out; + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_put_root; + fsinfo.fattr = data->fattr; + if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0) + nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); + sb->s_root->d_op = server->rpc_ops->dentry_ops; + sb->s_flags |= MS_ACTIVE; + return server; +out_put_root: + iput(root_inode); +out: + return err; +} + +/* + * Copy an existing superblock and attach revised data + */ +static int nfs_clone_generic_sb(struct nfs_clone_mount *data, + struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *), + struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *), + struct vfsmount *mnt) +{ + struct nfs_server *server; + struct nfs_server *parent = NFS_SB(data->sb); + struct super_block *sb = ERR_PTR(-EINVAL); + char *hostname; + int error = -ENOMEM; + int len; + + server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (server == NULL) + goto out_err; + memcpy(server, parent, sizeof(*server)); + hostname = (data->hostname != NULL) ? data->hostname : parent->hostname; + len = strlen(hostname) + 1; + server->hostname = kmalloc(len, GFP_KERNEL); + if (server->hostname == NULL) + goto free_server; + memcpy(server->hostname, hostname, len); + error = rpciod_up(); + if (error != 0) + goto free_hostname; + + sb = fill_sb(server, data); + if (IS_ERR(sb)) { + error = PTR_ERR(sb); + goto kill_rpciod; + } + + if (sb->s_root) + goto out_rpciod_down; + + server = fill_server(sb, data); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_deactivate; + } + return simple_set_mnt(mnt, sb); +out_deactivate: + up_write(&sb->s_umount); + deactivate_super(sb); + return error; +out_rpciod_down: + rpciod_down(); + kfree(server->hostname); + kfree(server); + return simple_set_mnt(mnt, sb); +kill_rpciod: + rpciod_down(); +free_hostname: + kfree(server->hostname); +free_server: + kfree(server); +out_err: + return error; +} + +/* + * Set up an NFS2/3 superblock + * + * The way this works is that the mount process passes a structure + * in the data argument which contains the server's IP address + * and the root file handle obtained from the server's mount + * daemon. We stash these away in the private superblock fields. + */ +static int +nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) +{ + struct nfs_server *server; + rpc_authflavor_t authflavor; + + server = NFS_SB(sb); + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + if (data->bsize) + sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + + server->acregmin = data->acregmin*HZ; + server->acregmax = data->acregmax*HZ; + server->acdirmin = data->acdirmin*HZ; + server->acdirmax = data->acdirmax*HZ; + + /* Start lockd here, before we might error out */ + if (!(server->flags & NFS_MOUNT_NONLM)) + lockd_up(); + + server->namelen = data->namlen; + server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL); + if (!server->hostname) + return -ENOMEM; + strcpy(server->hostname, data->hostname); + + /* Check NFS protocol revision and initialize RPC op vector + * and file handle pool. */ +#ifdef CONFIG_NFS_V3 + if (server->flags & NFS_MOUNT_VER3) { + server->rpc_ops = &nfs_v3_clientops; + server->caps |= NFS_CAP_READDIRPLUS; + } else { + server->rpc_ops = &nfs_v2_clientops; + } +#else + server->rpc_ops = &nfs_v2_clientops; +#endif + + /* Fill in pseudoflavor for mount version < 5 */ + if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) + data->pseudoflavor = RPC_AUTH_UNIX; + authflavor = data->pseudoflavor; /* save for sb_init() */ + /* XXX maybe we want to add a server->pseudoflavor field */ + + /* Create RPC client handles */ + server->client = nfs_create_client(server, data); + if (IS_ERR(server->client)) + return PTR_ERR(server->client); + /* RFC 2623, sec 2.3.2 */ + if (authflavor != RPC_AUTH_UNIX) { + struct rpc_auth *auth; + + server->client_sys = rpc_clone_client(server->client); + if (IS_ERR(server->client_sys)) + return PTR_ERR(server->client_sys); + auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys); + if (IS_ERR(auth)) + return PTR_ERR(auth); + } else { + atomic_inc(&server->client->cl_count); + server->client_sys = server->client; + } + if (server->flags & NFS_MOUNT_VER3) { +#ifdef CONFIG_NFS_V3_ACL + if (!(server->flags & NFS_MOUNT_NOACL)) { + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + /* No errors! Assume that Sun nfsacls are supported */ + if (!IS_ERR(server->client_acl)) + server->caps |= NFS_CAP_ACLS; + } +#else + server->flags &= ~NFS_MOUNT_NOACL; +#endif /* CONFIG_NFS_V3_ACL */ + /* + * The VFS shouldn't apply the umask to mode bits. We will + * do so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + sb->s_time_gran = 1; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + sb->s_op = &nfs_sops; + return nfs_sb_init(sb, authflavor); +} + +static int nfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return set_anon_super(s, data); +} + +static int nfs_compare_super(struct super_block *sb, void *data) +{ + struct nfs_server *server = data; + struct nfs_server *old = NFS_SB(sb); + + if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr) + return 0; + if (old->addr.sin_port != server->addr.sin_port) + return 0; + return !nfs_compare_fh(&old->fh, &server->fh); +} + +static int nfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +{ + int error; + struct nfs_server *server = NULL; + struct super_block *s; + struct nfs_fh *root; + struct nfs_mount_data *data = raw_data; + + error = -EINVAL; + if (data == NULL) { + dprintk("%s: missing data argument\n", __FUNCTION__); + goto out_err_noserver; + } + if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { + dprintk("%s: bad mount version\n", __FUNCTION__); + goto out_err_noserver; + } + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: mount structure version %d does not support NFSv3\n", + __FUNCTION__, + data->version); + goto out_err_noserver; + } + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) { + dprintk("%s: mount structure version %d does not support strong security\n", + __FUNCTION__, + data->version); + goto out_err_noserver; + } + case 5: + memset(data->context, 0, sizeof(data->context)); + } +#ifndef CONFIG_NFS_V3 + /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ + error = -EPROTONOSUPPORT; + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); + goto out_err_noserver; + } +#endif /* CONFIG_NFS_V3 */ + + error = -ENOMEM; + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + goto out_err_noserver; + /* Zero out the NFS state stuff */ + init_nfsv4_state(server); + server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); + + root = &server->fh; + if (data->flags & NFS_MOUNT_VER3) + root->size = data->root.size; + else + root->size = NFS2_FHSIZE; + error = -EINVAL; + if (root->size > sizeof(root->data)) { + dprintk("%s: invalid root filehandle\n", __FUNCTION__); + goto out_err; + } + memcpy(root->data, data->root.data, root->size); + + /* We now require that the mount process passes the remote address */ + memcpy(&server->addr, &data->addr, sizeof(server->addr)); + if (server->addr.sin_addr.s_addr == INADDR_ANY) { + dprintk("%s: mount program didn't pass remote address!\n", + __FUNCTION__); + goto out_err; + } + + /* Fire up rpciod if not yet running */ + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); + goto out_err; + } + + s = sget(fs_type, nfs_compare_super, nfs_set_super, server); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_rpciod; + } + + if (s->s_root) + goto out_rpciod_down; + + s->s_flags = flags; + + error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + return error; + } + s->s_flags |= MS_ACTIVE; + return simple_set_mnt(mnt, s); + +out_rpciod_down: + rpciod_down(); + kfree(server); + return simple_set_mnt(mnt, s); + +out_err_rpciod: + rpciod_down(); +out_err: + kfree(server); +out_err_noserver: + return error; +} + +static void nfs_kill_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + kill_anon_super(s); + + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + if (!IS_ERR(server->client_sys)) + rpc_shutdown_client(server->client_sys); + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + + if (!(server->flags & NFS_MOUNT_NONLM)) + lockd_down(); /* release rpc.lockd */ + + rpciod_down(); /* release rpciod */ + + nfs_free_iostats(server->io_stats); + kfree(server->hostname); + kfree(server); + nfs_release_automount_timer(); +} + +static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data) +{ + struct super_block *sb; + + server->fsid = data->fattr->fsid; + nfs_copy_fh(&server->fh, data->fh); + sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM)) + lockd_up(); + return sb; +} + +static int nfs_clone_nfs_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +{ + struct nfs_clone_mount *data = raw_data; + return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt); +} + +#ifdef CONFIG_NFS_V4 +static struct rpc_clnt *nfs4_create_client(struct nfs_server *server, + struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor) +{ + struct nfs4_client *clp; + struct rpc_xprt *xprt = NULL; + struct rpc_clnt *clnt = NULL; + int err = -EIO; + + clp = nfs4_get_client(&server->addr.sin_addr); + if (!clp) { + dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); + return ERR_PTR(err); + } + + /* Now create transport and client */ + down_write(&clp->cl_sem); + if (IS_ERR(clp->cl_rpcclient)) { + xprt = xprt_create_proto(proto, &server->addr, timeparms); + if (IS_ERR(xprt)) { + up_write(&clp->cl_sem); + err = PTR_ERR(xprt); + dprintk("%s: cannot create RPC transport. Error = %d\n", + __FUNCTION__, err); + goto out_fail; + } + /* Bind to a reserved port! */ + xprt->resvport = 1; + clnt = rpc_create_client(xprt, server->hostname, &nfs_program, + server->rpc_ops->version, flavor); + if (IS_ERR(clnt)) { + up_write(&clp->cl_sem); + err = PTR_ERR(clnt); + dprintk("%s: cannot create RPC client. Error = %d\n", + __FUNCTION__, err); + goto out_fail; + } + clnt->cl_intr = 1; + clnt->cl_softrtry = 1; + clp->cl_rpcclient = clnt; + memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); + nfs_idmap_new(clp); + } + list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); + clnt = rpc_clone_client(clp->cl_rpcclient); + if (!IS_ERR(clnt)) + server->nfs4_state = clp; + up_write(&clp->cl_sem); + clp = NULL; + + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. Error = %d\n", + __FUNCTION__, err); + return clnt; + } + + if (server->nfs4_state->cl_idmap == NULL) { + dprintk("%s: failed to create idmapper.\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); + } + + if (clnt->cl_auth->au_flavor != flavor) { + struct rpc_auth *auth; + + auth = rpcauth_create(flavor, clnt); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __FUNCTION__); + return (struct rpc_clnt *)auth; + } + } + return clnt; + + out_fail: + if (clp) + nfs4_put_client(clp); + return ERR_PTR(err); +} + +/* + * Set up an NFS4 superblock + */ +static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent) +{ + struct nfs_server *server; + struct rpc_timeout timeparms; + rpc_authflavor_t authflavour; + int err = -EIO; + + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + server = NFS_SB(sb); + if (data->rsize != 0) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize != 0) + server->wsize = nfs_block_size(data->wsize, NULL); + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->caps = NFS_CAP_ATOMIC_OPEN; + + server->acregmin = data->acregmin*HZ; + server->acregmax = data->acregmax*HZ; + server->acdirmin = data->acdirmin*HZ; + server->acdirmax = data->acdirmax*HZ; + + server->rpc_ops = &nfs_v4_clientops; + + nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); + + server->retrans_timeo = timeparms.to_initval; + server->retrans_count = timeparms.to_retries; + + /* Now create transport and client */ + authflavour = RPC_AUTH_UNIX; + if (data->auth_flavourlen != 0) { + if (data->auth_flavourlen != 1) { + dprintk("%s: Invalid number of RPC auth flavours %d.\n", + __FUNCTION__, data->auth_flavourlen); + err = -EINVAL; + goto out_fail; + } + if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) { + err = -EFAULT; + goto out_fail; + } + } + + server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour); + if (IS_ERR(server->client)) { + err = PTR_ERR(server->client); + dprintk("%s: cannot create RPC client. Error = %d\n", + __FUNCTION__, err); + goto out_fail; + } + + sb->s_time_gran = 1; + + sb->s_op = &nfs4_sops; + err = nfs_sb_init(sb, authflavour); + + out_fail: + return err; +} + +static int nfs4_compare_super(struct super_block *sb, void *data) +{ + struct nfs_server *server = data; + struct nfs_server *old = NFS_SB(sb); + + if (strcmp(server->hostname, old->hostname) != 0) + return 0; + if (strcmp(server->mnt_path, old->mnt_path) != 0) + return 0; + return 1; +} + +static void * +nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) +{ + void *p = NULL; + + if (!src->len) + return ERR_PTR(-EINVAL); + if (src->len < maxlen) + maxlen = src->len; + if (dst == NULL) { + p = dst = kmalloc(maxlen + 1, GFP_KERNEL); + if (p == NULL) + return ERR_PTR(-ENOMEM); + } + if (copy_from_user(dst, src->data, maxlen)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + dst[maxlen] = '\0'; + return dst; +} + +static int nfs4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +{ + int error; + struct nfs_server *server; + struct super_block *s; + struct nfs4_mount_data *data = raw_data; + void *p; + + if (data == NULL) { + dprintk("%s: missing data argument\n", __FUNCTION__); + return -EINVAL; + } + if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { + dprintk("%s: bad mount version\n", __FUNCTION__); + return -EINVAL; + } + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return -ENOMEM; + /* Zero out the NFS state stuff */ + init_nfsv4_state(server); + server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); + + p = nfs_copy_user_string(NULL, &data->hostname, 256); + if (IS_ERR(p)) + goto out_err; + server->hostname = p; + + p = nfs_copy_user_string(NULL, &data->mnt_path, 1024); + if (IS_ERR(p)) + goto out_err; + server->mnt_path = p; + + p = nfs_copy_user_string(server->ip_addr, &data->client_addr, + sizeof(server->ip_addr) - 1); + if (IS_ERR(p)) + goto out_err; + + /* We now require that the mount process passes the remote address */ + if (data->host_addrlen != sizeof(server->addr)) { + error = -EINVAL; + goto out_free; + } + if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { + error = -EFAULT; + goto out_free; + } + if (server->addr.sin_family != AF_INET || + server->addr.sin_addr.s_addr == INADDR_ANY) { + dprintk("%s: mount program didn't pass remote IP address!\n", + __FUNCTION__); + error = -EINVAL; + goto out_free; + } + + /* Fire up rpciod if not yet running */ + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); + goto out_free; + } + + s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); + + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_free; + } + + if (s->s_root) { + kfree(server->mnt_path); + kfree(server->hostname); + kfree(server); + return simple_set_mnt(mnt, s); + } + + s->s_flags = flags; + + error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + return error; + } + s->s_flags |= MS_ACTIVE; + return simple_set_mnt(mnt, s); +out_err: + error = PTR_ERR(p); +out_free: + kfree(server->mnt_path); + kfree(server->hostname); + kfree(server); + return error; +} + +static void nfs4_kill_super(struct super_block *sb) +{ + struct nfs_server *server = NFS_SB(sb); + + nfs_return_all_delegations(sb); + kill_anon_super(sb); + + nfs4_renewd_prepare_shutdown(server); + + if (server->client != NULL && !IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + destroy_nfsv4_state(server); + + rpciod_down(); + + nfs_free_iostats(server->io_stats); + kfree(server->hostname); + kfree(server); + nfs_release_automount_timer(); +} + +/* + * Constructs the SERVER-side path + */ +static inline char *nfs4_dup_path(const struct dentry *dentry) +{ + char *page = (char *) __get_free_page(GFP_USER); + char *path; + + path = nfs4_path(dentry, page, PAGE_SIZE); + if (!IS_ERR(path)) { + int len = PAGE_SIZE + page - path; + char *tmp = path; + + path = kmalloc(len, GFP_KERNEL); + if (path) + memcpy(path, tmp, len); + else + path = ERR_PTR(-ENOMEM); + } + free_page((unsigned long)page); + return path; +} + +static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data) +{ + const struct dentry *dentry = data->dentry; + struct nfs4_client *clp = server->nfs4_state; + struct super_block *sb; + + server->fsid = data->fattr->fsid; + nfs_copy_fh(&server->fh, data->fh); + server->mnt_path = nfs4_dup_path(dentry); + if (IS_ERR(server->mnt_path)) { + sb = (struct super_block *)server->mnt_path; + goto err; + } + sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server); + if (IS_ERR(sb) || sb->s_root) + goto free_path; + nfs4_server_capabilities(server, &server->fh); + + down_write(&clp->cl_sem); + atomic_inc(&clp->cl_count); + list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); + up_write(&clp->cl_sem); + return sb; +free_path: + kfree(server->mnt_path); +err: + server->mnt_path = NULL; + return sb; +} + +static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +{ + struct nfs_clone_mount *data = raw_data; + return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt); +} + +static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data) +{ + struct super_block *sb = ERR_PTR(-ENOMEM); + int len; + + len = strlen(data->mnt_path) + 1; + server->mnt_path = kmalloc(len, GFP_KERNEL); + if (server->mnt_path == NULL) + goto err; + memcpy(server->mnt_path, data->mnt_path, len); + memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in)); + + sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server); + if (IS_ERR(sb) || sb->s_root) + goto free_path; + return sb; +free_path: + kfree(server->mnt_path); +err: + server->mnt_path = NULL; + return sb; +} + +static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data) +{ + struct nfs_server *server = NFS_SB(sb); + struct rpc_timeout timeparms; + int proto, timeo, retrans; + void *err; + + proto = IPPROTO_TCP; + /* Since we are following a referral and there may be alternatives, + set the timeouts and retries to low values */ + timeo = 2; + retrans = 1; + nfs_init_timeout_values(&timeparms, proto, timeo, retrans); + + server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor); + if (IS_ERR((err = server->client))) + goto out_err; + + sb->s_time_gran = 1; + sb->s_op = &nfs4_sops; + err = ERR_PTR(nfs_sb_init(sb, data->authflavor)); + if (!IS_ERR(err)) + return server; +out_err: + return (struct nfs_server *)err; +} + +static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +{ + struct nfs_clone_mount *data = raw_data; + return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt); +} + +#endif diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 18dc95b0b646..600bbe630abd 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; struct page *page; - void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode)); + void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) goto read_failed; page = read_cache_page(&inode->i_data, 0, @@ -75,22 +75,13 @@ read_failed: return NULL; } -static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) -{ - if (cookie) { - struct page *page = cookie; - kunmap(page); - page_cache_release(page); - } -} - /* * symlinks can't do much... */ struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = nfs_follow_link, - .put_link = nfs_put_link, + .put_link = page_put_link, .getattr = nfs_getattr, .setattr = nfs_setattr, }; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 4c486eb867ca..db61e51bb154 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/nfs4.h> #include <linux/nfs_idmap.h> +#include <linux/nfs_fs.h> #include "callback.h" @@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = { .strategy = &sysctl_jiffies, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_mountpoint_timeout", + .data = &nfs_mountpoint_expiry_timeout, + .maxlen = sizeof(nfs_mountpoint_expiry_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, { .ctl_name = 0 } }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4cfada2cc09f..8fccb9cb173b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - if (pagecount < NFS_PAGEVEC_SIZE) - p->pagevec = &p->page_array[0]; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; else { - size_t size = ++pagecount * sizeof(struct page *); - p->pagevec = kzalloc(size, GFP_NOFS); + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); if (!p->pagevec) { mempool_free(p, nfs_commit_mempool); p = NULL; @@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); - if (pagecount < NFS_PAGEVEC_SIZE) - p->pagevec = &p->page_array[0]; + if (pagecount <= ARRAY_SIZE(p->page_array)) + p->pagevec = p->page_array; else { - size_t size = ++pagecount * sizeof(struct page *); - p->pagevec = kmalloc(size, GFP_NOFS); - if (p->pagevec) { - memset(p->pagevec, 0, size); - } else { + p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + if (!p->pagevec) { mempool_free(p, nfs_wdata_mempool); p = NULL; } @@ -583,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un return ret; } +static void nfs_cancel_requests(struct list_head *head) +{ + struct nfs_page *req; + while(!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_inode_remove_request(req); + nfs_clear_page_writeback(req); + } +} + /* * nfs_scan_dirty - Scan an inode for dirty requests * @inode: NFS inode to scan @@ -627,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st int res = 0; if (nfsi->ncommit != 0) { - res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); + res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages); nfsi->ncommit -= res; if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); @@ -1495,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start, pages = nfs_scan_dirty(inode, &head, idx_start, npages); if (pages != 0) { spin_unlock(&nfsi->req_lock); - ret = nfs_flush_list(inode, &head, pages, how); + if (how & FLUSH_INVALIDATE) + nfs_cancel_requests(&head); + else + ret = nfs_flush_list(inode, &head, pages, how); spin_lock(&nfsi->req_lock); continue; } if (nocommit) break; - pages = nfs_scan_commit(inode, &head, 0, 0); + pages = nfs_scan_commit(inode, &head, idx_start, npages); if (pages == 0) break; + if (how & FLUSH_INVALIDATE) { + spin_unlock(&nfsi->req_lock); + nfs_cancel_requests(&head); + spin_lock(&nfsi->req_lock); + continue; + } + pages += nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&nfsi->req_lock); ret = nfs_commit_list(inode, &head, how); spin_lock(&nfsi->req_lock); @@ -1512,7 +1529,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start, return ret; } -int nfs_init_writepagecache(void) +int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", sizeof(struct nfs_write_data), diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 96c7578cbe1e..7c7d01672d35 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags); */ /* recall_lock protects the del_recall_lru */ -static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; static void @@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_del_init(&clp->cl_strhash); - list_del_init(&clp->cl_idhash); - list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d852ebb538e3..fdf7cf3dfadc 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -103,8 +103,7 @@ nfsd_cache_shutdown(void) static void lru_put_end(struct svc_cacherep *rp) { - list_del(&rp->c_lru); - list_add_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &lru_head); } /* diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 88292f9e4b9b..2e42c2dcae12 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1358,7 +1358,7 @@ err_out: goto out; } -static size_t __ntfs_copy_from_user_iovec(char *vaddr, +static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, const struct iovec *iov, size_t iov_ofs, size_t bytes) { size_t total = 0; @@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr, bytes -= len; vaddr += len; if (unlikely(left)) { - /* - * Zero the rest of the target like __copy_from_user(). - */ - memset(vaddr, 0, bytes); total -= left; break; } @@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp, * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s * single-segment behaviour. * - * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and - * when not atomic. This is ok because __ntfs_copy_from_user_iovec() calls - * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In - * fact, the only difference between __copy_from_user_inatomic() and - * __copy_from_user() is that the latter calls might_sleep(). And on many + * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both + * when atomic and when not atomic. This is ok because + * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic() + * and it is ok to call this when non-atomic. + * Infact, the only difference between __copy_from_user_inatomic() and + * __copy_from_user() is that the latter calls might_sleep() and the former + * should not zero the tail of the buffer on error. And on many * architectures __copy_from_user_inatomic() is just defined to * __copy_from_user() so it makes no difference at all on those architectures. */ @@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages, if (len > bytes) len = bytes; kaddr = kmap_atomic(*pages, KM_USER0); - copied = __ntfs_copy_from_user_iovec(kaddr + ofs, + copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, *iov, *iov_ofs, len); kunmap_atomic(kaddr, KM_USER0); if (unlikely(copied != len)) { /* Do it the slow way. */ kaddr = kmap(*pages); - copied = __ntfs_copy_from_user_iovec(kaddr + ofs, + copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs, *iov, *iov_ofs, len); + /* + * Zero the rest of the target like __copy_from_user(). + */ + memset(kaddr + ofs + copied, 0, len - copied); kunmap(*pages); if (unlikely(copied != len)) goto err_out; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21f38accd039..1d26cfcd9f84 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem); * multiple hb threads are watching multiple regions. A node is live * whenever any of the threads sees activity from the node in its region. */ -static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(o2hb_live_lock); static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f60cc0d3985..1591eb37a723 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ ##args); \ } while (0) -static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(o2net_handler_lock); static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593dd8ef8..42775e2bbe2c 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. + */ spin_unlock(&res->spinlock); } @@ -381,8 +383,7 @@ do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); mlog(0, "ast: adding to granted list... type=%d, " "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43df18f1..9bdc9cf65991 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. */ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; i<DLM_LVB_LEN; i++) + if (lvb[i]) + return 0; + return 1; +} + static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -609,7 +643,8 @@ struct dlm_finalize_reco { u8 node_idx; u8 dead_node; - __be16 pad1; + u8 flags; + u8 pad1; __be32 pad2; }; @@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres); -void dlm_lockres_get(struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228d9e37..c764dc8e40a2 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -231,8 +234,7 @@ switch_queues: lock->ml.convert_type = type; /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); unlock_exit: spin_unlock(&lock->spinlock); @@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); } @@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); lock->convert_pending = 1; lock->ml.convert_type = type; @@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d3324e..3f6c8d88f7af 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid) } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); for (i=0; i<DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510c3ccd..000000000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3106fd..b8c23f7ba67e 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -62,7 +88,7 @@ * */ -spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(dlm_domain_lock); LIST_HEAD(dlm_domains); static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1207,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - for (i=0; i<DLM_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dlm->lockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7273d9fa6bab..033ad1701232 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode, * doesn't make sense for LVB writes. */ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp, else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp, else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea28318d6d..5ca57ec650c7 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -53,7 +53,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, mlog(0, "%s: $RECOVERY lock for this node (%u) is " "mastered by %u; got lock, manually granting (no ast)\n", dlm->name, dlm->node_num, res->owner); - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +700,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c13b1f..1b8346dd0572 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; i<O2NM_MAX_NODES; i++) + if (test_bit(i, map)) + printk("%d ", i); + printk("]"); +} -void dlm_print_one_mle(struct dlm_master_list_entry *mle) +static void dlm_print_one_mle(struct dlm_master_list_entry *mle) { - int i = 0, refs; + int refs; char *type; char attached; u8 master; unsigned int namelen; const char *name; struct kref *k; + unsigned long *maybe = mle->maybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. */ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref) kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. */ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, "this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. (%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. */ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. */ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbabac051..29b2845f370d 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); static u64 dlm_get_next_mig_cookie(void); -static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); static u64 dlm_mig_cookie = 1; static u64 dlm_get_next_mig_cookie(void) @@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins @@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm) void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. */ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? + "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. */ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "found UNKNOWN owner while doing recovery " "for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } } spin_unlock(&dlm->spinlock); @@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "all done flag. all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* move the lock to its proper place */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, queue); + list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); mlog(0, "just reordered a local lock!\n"); @@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * need to be fired as a result. */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2329,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2367,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2460,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. marking %u as dead\n", @@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14f12cb..0c822f3ffb05 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include <linux/inet.h> #include <linux/timer.h> #include <linux/kthread.h> +#include <linux/delay.h> #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. */ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -318,8 +367,7 @@ converting: target->ml.type = target->ml.convert_type; target->ml.convert_type = LKM_IVMODE; - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -380,8 +428,7 @@ blocked: target->ml.type, target->ml.node); // target->ml.type is already correct - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +655,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +693,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +709,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +731,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a27542674..b0c3134f4f70 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res, void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock) { - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; } @@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + memset(&unlock, 0, sizeof(unlock)); unlock.node_idx = dlm->node_num; unlock.flags = cpu_to_be32(flags); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 74ca4e5f9765..e641b084b343 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name) u32 dlm_key; char *domain; - domain = kmalloc(name->len + 1, GFP_KERNEL); + domain = kmalloc(name->len + 1, GFP_NOFS); if (!domain) { mlog_errno(-ENOMEM); return ERR_PTR(-ENOMEM); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64cd52860c87..4acd37286bdd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type, mlog_exit_void(); } -static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, struct ocfs2_dlm_debug *dlm_debug) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index eebc3cfa6be8..910a601b2e98 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -49,7 +49,7 @@ #include "buffer_head_io.h" -spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(trans_inc_lock); static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, @@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); OCFS2_I(inode)->ip_handle = handle; - list_del(&(OCFS2_I(inode)->ip_handle_list)); - list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); + list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); } static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index ee42765a8553..cf70fe2075b8 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } @@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } diff --git a/fs/open.c b/fs/open.c index 5fb16e5267dc..303f06d2a7b9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = do_truncate(dentry, length, 0, file); + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); out_putf: fput(file); out: diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 464e2bce0203..93a56bd4a2b7 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1,5 +1,4 @@ -/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $ - * openpromfs.c: /proc/openprom handling routines +/* inode.c: /proc/openprom handling routines * * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) @@ -12,756 +11,245 @@ #include <linux/openprom_fs.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/smp_lock.h> +#include <linux/seq_file.h> #include <asm/openprom.h> #include <asm/oplib.h> +#include <asm/prom.h> #include <asm/uaccess.h> -#define ALIASES_NNODES 64 - -typedef struct { - u16 parent; - u16 next; - u16 child; - u16 first_prop; - u32 node; -} openpromfs_node; - -typedef struct { -#define OPP_STRING 0x10 -#define OPP_STRINGLIST 0x20 -#define OPP_BINARY 0x40 -#define OPP_HEXSTRING 0x80 -#define OPP_DIRTY 0x01 -#define OPP_QUOTED 0x02 -#define OPP_NOTQUOTED 0x04 -#define OPP_ASCIIZ 0x08 - u32 flag; - u32 alloclen; - u32 len; - char *value; - char name[8]; -} openprom_property; - -static openpromfs_node *nodes; -static int alloced; -static u16 last_node; -static u16 first_prop; -static u16 options = 0xffff; -static u16 aliases = 0xffff; -static int aliases_nodes; -static char *alias_names [ALIASES_NNODES]; - -#define OPENPROM_ROOT_INO 16 -#define OPENPROM_FIRST_INO OPENPROM_ROOT_INO -#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO] -#define NODE2INO(node) (node + OPENPROM_FIRST_INO) -#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node) - -static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *); -static int openpromfs_readdir(struct file *, void *, filldir_t); -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); -static int openpromfs_unlink (struct inode *, struct dentry *dentry); +static DEFINE_MUTEX(op_mutex); + +#define OPENPROM_ROOT_INO 0 + +enum op_inode_type { + op_inode_node, + op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; -static ssize_t nodenum_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static inline struct op_inode_info *OP_I(struct inode *inode) { - struct inode *inode = file->f_dentry->d_inode; - char buffer[10]; - - if (count < 0 || !inode->u.generic_ip) - return -EINVAL; - sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip)); - if (file->f_pos >= 9) - return 0; - if (count > 9 - file->f_pos) - count = 9 - file->f_pos; - if (copy_to_user(buf, buffer + file->f_pos, count)) - return -EFAULT; - *ppos += count; - return count; + return container_of(inode, struct op_inode_info, vfs_inode); } -static ssize_t property_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static int is_string(unsigned char *p, int len) { - struct inode *inode = filp->f_dentry->d_inode; - int i, j, k; - u32 node; - char *p, *s; - u32 *q; - openprom_property *op; - char buffer[64]; - - if (!filp->private_data) { - node = nodes[(u16)((long)inode->u.generic_ip)].node; - i = ((u32)(long)inode->u.generic_ip) >> 16; - if ((u16)((long)inode->u.generic_ip) == aliases) { - if (i >= aliases_nodes) - p = NULL; - else - p = alias_names [i]; - } else - for (p = prom_firstprop (node, buffer); - i && p && *p; - p = prom_nextprop (node, p, buffer), i--) - /* nothing */ ; - if (!p || !*p) - return -EIO; - i = prom_getproplen (node, p); - if (i < 0) { - if ((u16)((long)inode->u.generic_ip) == aliases) - i = 0; - else - return -EIO; - } - k = i; - if (i < 64) i = 64; - filp->private_data = kmalloc (sizeof (openprom_property) - + (j = strlen (p)) + 2 * i, - GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - op = (openprom_property *)filp->private_data; - op->flag = 0; - op->alloclen = 2 * i; - strcpy (op->name, p); - op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3); - op->len = k; - if (k && prom_getproperty (node, p, op->value, i) < 0) - return -EIO; - op->value [k] = 0; - if (k) { - for (s = NULL, p = op->value; p < op->value + k; p++) { - if ((*p >= ' ' && *p <= '~') || *p == '\n') { - op->flag |= OPP_STRING; - s = p; - continue; - } - if (p > op->value && !*p && s == p - 1) { - if (p < op->value + k - 1) - op->flag |= OPP_STRINGLIST; - else - op->flag |= OPP_ASCIIZ; - continue; - } - if (k == 1 && !*p) { - op->flag |= (OPP_STRING|OPP_ASCIIZ); - break; - } - op->flag &= ~(OPP_STRING|OPP_STRINGLIST); - if (k & 3) - op->flag |= OPP_HEXSTRING; - else - op->flag |= OPP_BINARY; - break; - } - if (op->flag & OPP_STRINGLIST) - op->flag &= ~(OPP_STRING); - if (op->flag & OPP_ASCIIZ) - op->len--; - } - } else - op = (openprom_property *)filp->private_data; - if (!count || !(op->len || (op->flag & OPP_ASCIIZ))) - return 0; - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (op->flag & OPP_STRINGLIST) { - for (k = 0, p = op->value; p < op->value + op->len; p++) - if (!*p) - k++; - i = op->len + 4 * k + 3; - } else if (op->flag & OPP_STRING) { - i = op->len + 3; - } else if (op->flag & OPP_BINARY) { - i = (op->len * 9) >> 2; - } else { - i = (op->len << 1) + 1; - } - k = *ppos; - if (k >= i) return 0; - if (count > i - k) count = i - k; - if (op->flag & OPP_STRING) { - if (!k) { - if (put_user('\'', buf)) - return -EFAULT; - k++; - count--; - } + int i; - if (k + count >= i - 2) - j = i - 2 - k; - else - j = count; - - if (j >= 0) { - if (copy_to_user(buf + k - *ppos, - op->value + k - 1, j)) - return -EFAULT; - count -= j; - k += j; - } + for (i = 0; i < len; i++) { + unsigned char val = p[i]; - if (count) { - if (put_user('\'', &buf [k++ - *ppos])) - return -EFAULT; - } - if (count > 1) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } else if (op->flag & OPP_STRINGLIST) { - char *tmp; - - tmp = kmalloc (i, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - s = tmp; - *s++ = '\''; - for (p = op->value; p < op->value + op->len; p++) { - if (!*p) { - strcpy(s, "' + '"); - s += 5; - continue; - } - *s++ = *p; - } - strcpy(s, "'\n"); - - if (copy_to_user(buf, tmp + k, count)) - return -EFAULT; - - kfree(tmp); - k += count; - - } else if (op->flag & OPP_BINARY) { - char buffer[10]; - u32 *first, *last; - int first_off, last_cnt; - - first = ((u32 *)op->value) + k / 9; - first_off = k % 9; - last = ((u32 *)op->value) + (k + count - 1) / 9; - last_cnt = (k + count) % 9; - if (!last_cnt) last_cnt = 9; - - if (first == last) { - sprintf (buffer, "%08x.", *first); - if (copy_to_user(buf, buffer + first_off, - last_cnt - first_off)) - return -EFAULT; - buf += last_cnt - first_off; - } else { - for (q = first; q <= last; q++) { - sprintf (buffer, "%08x.", *q); - if (q == first) { - if (copy_to_user(buf, buffer + first_off, - 9 - first_off)) - return -EFAULT; - buf += 9 - first_off; - } else if (q == last) { - if (copy_to_user(buf, buffer, last_cnt)) - return -EFAULT; - buf += last_cnt; - } else { - if (copy_to_user(buf, buffer, 9)) - return -EFAULT; - buf += 9; - } - } - } + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; - if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { - if (put_user('\n', (buf - 1))) - return -EFAULT; - } + return 0; + } - k += count; + return 1; +} - } else if (op->flag & OPP_HEXSTRING) { - char buffer[3]; +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; - if ((k < i - 1) && (k & 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[1], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + len = prop->length; + pval = prop->value; - for (; (count > 1) && (k < i - 1); k += 2) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (copy_to_user(buf + k - *ppos, buffer, 2)) - return -EFAULT; - count -= 2; - } + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); - if (count && (k < i - 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[0], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + seq_printf(f, "%s", (char *) pval); - if (count) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } - count = k - *ppos; - *ppos = k; - return count; -} + /* Skip over the NULL byte too. */ + pval += n + 1; + len -= n + 1; -static ssize_t property_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - int i, j, k; - char *p; - u32 *q; - void *b; - openprom_property *op; - - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (!filp->private_data) { - i = property_read (filp, NULL, 0, NULL); - if (i) - return i; - } - k = *ppos; - op = (openprom_property *)filp->private_data; - if (!(op->flag & OPP_STRING)) { - u32 *first, *last; - int first_off, last_cnt; - u32 mask, mask2; - char tmp [9]; - int forcelen = 0; - - j = k % 9; - for (i = 0; i < count; i++, j++) { - if (j == 9) j = 0; - if (!j) { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp != '.') { - if (ctmp != '\n') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } else { - count = i + 1; - forcelen = 1; - break; - } - } - } else { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp < '0' || - (ctmp > '9' && ctmp < 'A') || - (ctmp > 'F' && ctmp < 'a') || - ctmp > 'f') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } - } - } - op->flag |= OPP_BINARY; - tmp [8] = 0; - i = ((count + k + 8) / 9) << 2; - if (op->alloclen <= i) { - b = kmalloc (sizeof (openprom_property) + 2 * i, - GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (((char *)b) + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2 * i - op->alloclen); - op = (openprom_property *)b; - op->alloclen = 2*i; - b = filp->private_data; - filp->private_data = (void *)op; - kfree (b); + if (len > 0) + seq_printf(f, " + "); } - first = ((u32 *)op->value) + (k / 9); - first_off = k % 9; - last = (u32 *)(op->value + i); - last_cnt = (k + count) % 9; - if (first + 1 == last) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, buf, - (count + first_off > 8) ? - 8 - first_off : count)) - return -EFAULT; - mask = 0xffffffff; - mask2 = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - for (j = 8 - count - first_off; j > 0; j--) - mask2 <<= 1; - mask &= mask2; - if (mask) { - *first &= ~mask; - *first |= simple_strtoul (tmp, NULL, 16); - op->flag |= OPP_DIRTY; + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; } } else { - op->flag |= OPP_DIRTY; - for (q = first; q < last; q++) { - if (q == first) { - if (first_off < 8) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, - buf, - 8 - first_off)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp,NULL,16); - } - buf += 9; - } else if ((q == last - 1) && last_cnt - && (last_cnt < 8)) { - memset (tmp, '0', 8); - if (copy_from_user(tmp, buf, last_cnt)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < 8 - last_cnt; j++) - mask <<= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp, NULL, 16); - buf += last_cnt; - } else { - char tchars[17]; /* XXX yuck... */ - - if (copy_from_user(tchars, buf, 16)) - return -EFAULT; - *q = simple_strtoul (tchars, NULL, 16); - buf += 9; - } - } - } - if (!forcelen) { - if (op->len < i) - op->len = i; - } else - op->len = i; - *ppos += count; - } -write_try_string: - if (!(op->flag & OPP_BINARY)) { - if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) { - char ctmp; - - /* No way, if somebody starts writing from the middle, - * we don't know whether he uses quotes around or not - */ - if (k > 0) - return -EINVAL; - if (get_user(ctmp, buf)) - return -EFAULT; - if (ctmp == '\'') { - op->flag |= OPP_QUOTED; - buf++; - count--; - (*ppos)++; - if (!count) { - op->flag |= OPP_STRING; - return 1; - } - } else - op->flag |= OPP_NOTQUOTED; - } - op->flag |= OPP_STRING; - if (op->alloclen <= count + *ppos) { - b = kmalloc (sizeof (openprom_property) - + 2 * (count + *ppos), GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (((char *)b) + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2*(count - *ppos) - op->alloclen); - op = (openprom_property *)b; - op->alloclen = 2*(count + *ppos); - b = filp->private_data; - filp->private_data = (void *)op; - kfree (b); - } - p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0); - if (copy_from_user(p, buf, count)) - return -EFAULT; - op->flag |= OPP_DIRTY; - for (i = 0; i < count; i++, p++) - if (*p == '\n') { - *p = 0; - break; + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; } - if (i < count) { - op->len = p - op->value; - *ppos += i + 1; - if ((p > op->value) && (op->flag & OPP_QUOTED) - && (*(p - 1) == '\'')) - op->len--; - } else { - if (p - op->value > op->len) - op->len = p - op->value; - *ppos += count; } } - return *ppos - k; + seq_printf(f, "\n"); + + return 0; } -int property_release (struct inode *inode, struct file *filp) +static void *property_start(struct seq_file *f, loff_t *pos) { - openprom_property *op = (openprom_property *)filp->private_data; - int error; - u32 node; - - if (!op) - return 0; - lock_kernel(); - node = nodes[(u16)((long)inode->u.generic_ip)].node; - if ((u16)((long)inode->u.generic_ip) == aliases) { - if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { - char *p = op->name; - int i = (op->value - op->name) - strlen (op->name) - 1; - op->value [op->len] = 0; - *(op->value - 1) = ' '; - if (i) { - for (p = op->value - i - 2; p >= op->name; p--) - p[i] = *p; - p = op->name + i; - } - memcpy (p - 8, "nvalias ", 8); - prom_feval (p - 8); - } - } else if (op->flag & OPP_DIRTY) { - if (op->flag & OPP_STRING) { - op->value [op->len] = 0; - error = prom_setprop (node, op->name, - op->value, op->len + 1); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else if ((op->flag & OPP_BINARY) || !op->len) { - error = prom_setprop (node, op->name, - op->value, op->len); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else { - printk (KERN_WARNING "openpromfs: " - "Unknown property type of %s\n", - op->name); - } + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; } - unlock_kernel(); - kfree (filp->private_data); - return 0; + return ret; } static const struct file_operations openpromfs_prop_ops = { - .read = property_read, - .write = property_write, - .release = property_release, + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static const struct file_operations openpromfs_nodenum_ops = { - .read = nodenum_read, -}; +static int openpromfs_readdir(struct file *, void *, filldir_t); static const struct file_operations openprom_operations = { .read = generic_read_dir, .readdir = openpromfs_readdir, }; -static struct inode_operations openprom_alias_inode_operations = { - .create = openpromfs_create, - .lookup = openpromfs_lookup, - .unlink = openpromfs_unlink, -}; +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); static struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static int lookup_children(u16 n, const char * name, int len) -{ - int ret; - u16 node; - for (; n != 0xffff; n = nodes[n].next) { - node = nodes[n].child; - if (node != 0xffff) { - char buffer[128]; - int i; - char *p; - - while (node != 0xffff) { - if (prom_getname (nodes[node].node, - buffer, 128) >= 0) { - i = strlen (buffer); - if ((len == i) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - } - node = nodes[node].next; - } - } else - continue; - ret = lookup_children (nodes[n].child, name, len); - if (ret) return ret; - } - return 0; -} - -static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - int ino = 0; -#define OPFSL_DIR 0 -#define OPFSL_PROPERTY 1 -#define OPFSL_NODENUM 2 - int type = 0; - char buffer[128]; - char *p; + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; const char *name; - u32 n; - u16 dirnode; - unsigned int len; - int i; struct inode *inode; - char buffer2[64]; + unsigned int ino; + int len; - inode = NULL; + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + name = dentry->d_name.name; len = dentry->d_name.len; - lock_kernel(); - if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { - ino = NODEP2INO(NODE(dir->i_ino).first_prop); - type = OPFSL_NODENUM; - } - if (!ino) { - u16 node = NODE(dir->i_ino).child; - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) >= 0) { - i = strlen (buffer); - if (len == i && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - } - node = nodes[node].next; - } - } - n = NODE(dir->i_ino).node; - dirnode = dir->i_ino - OPENPROM_FIRST_INO; - if (!ino) { - int j = NODEP2INO(NODE(dir->i_ino).first_prop); - if (dirnode != aliases) { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if ((len == strlen (p)) - && !strncmp (p, name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } - } else { - int k; - for (k = 0; k < aliases_nodes; k++) { - j++; - if (alias_names [k] - && (len == strlen (alias_names [k])) - && !strncmp (alias_names [k], name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; } + child = child->sibling; } - if (!ino) { - ino = lookup_children (NODE(dir->i_ino).child, name, len); - if (ino) - type = OPFSL_DIR; - else { - unlock_kernel(); - return ERR_PTR(-ENOENT); + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; } + + prop = prop->next; } - inode = iget (dir->i_sb, ino); - unlock_kernel(); + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = iget(dir->i_sb, ino); + mutex_unlock(&op_mutex); if (!inode) return ERR_PTR(-EINVAL); - switch (type) { - case OPFSL_DIR: + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (ino == OPENPROM_FIRST_INO + aliases) { - inode->i_mode |= S_IWUSR; - inode->i_op = &openprom_alias_inode_operations; - } else - inode->i_op = &openprom_inode_operations; + inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; inode->i_nlink = 2; break; - case OPFSL_NODENUM: - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_nodenum_ops; - inode->i_nlink = 1; - inode->u.generic_ip = (void *)(long)(n); - break; - case OPFSL_PROPERTY: - if ((dirnode == options) && (len == 17) - && !strncmp (name, "security-password", 17)) + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else { + else inode->i_mode = S_IFREG | S_IRUGO; - if (dirnode == options || dirnode == aliases) { - if (len != 4 || strncmp (name, "name", 4)) - inode->i_mode |= S_IWUSR; - } - } inode->i_fop = &openpromfs_prop_ops; inode->i_nlink = 1; - if (inode->i_size < 0) - inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)dirnode) | - (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16)); + inode->i_size = ent_oi->u.prop->length; break; } @@ -775,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; unsigned int ino; - u32 n; - int i, j; - char buffer[128]; - u16 node; - char *p; - char buffer2[64]; - - lock_kernel(); + int i; + + mutex_lock(&op_mutex); ino = inode->i_ino; i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; i++; filp->f_pos++; /* fall thru */ case 1: - if (filldir(dirent, "..", 2, i, - (NODE(ino).parent == 0xffff) ? - OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall thru */ default: i -= 2; - node = NODE(ino).child; - while (i && node != 0xffff) { - node = nodes[node].next; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; i--; } - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) < 0) - goto out; - if (filldir(dirent, buffer, strlen(buffer), - filp->f_pos, NODE2INO(node), DT_DIR) < 0) + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) goto out; + filp->f_pos++; - node = nodes[node].next; + child = child->sibling; } - j = NODEP2INO(NODE(ino).first_prop); - if (!i) { - if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) + + /* Next, the properties as files. */ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) goto out; + filp->f_pos++; - } else - i--; - n = NODE(ino).node; - if (ino == OPENPROM_FIRST_INO + aliases) { - for (j++; i < aliases_nodes; i++, j++) { - if (alias_names [i]) { - if (filldir (dirent, alias_names [i], - strlen (alias_names [i]), - filp->f_pos, j, DT_REG) < 0) goto out; - filp->f_pos++; - } - } - } else { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if (i) i--; - else { - if (filldir(dirent, p, strlen(p), - filp->f_pos, j, DT_REG) < 0) - goto out; - filp->f_pos++; - } - } + prop = prop->next; } } out: - unlock_kernel(); - return 0; -} - -static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - char *p; - struct inode *inode; - - if (!dir) - return -ENOENT; - if (dentry->d_name.len > 256) - return -EINVAL; - p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL); - if (!p) - return -ENOMEM; - strncpy (p, dentry->d_name.name, dentry->d_name.len); - p [dentry->d_name.len] = 0; - lock_kernel(); - if (aliases_nodes == ALIASES_NNODES) { - kfree(p); - unlock_kernel(); - return -EIO; - } - alias_names [aliases_nodes++] = p; - inode = iget (dir->i_sb, - NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes); - if (!inode) { - unlock_kernel(); - return -EINVAL; - } - inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR; - inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; - if (inode->i_size < 0) inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)aliases) | - (((u16)(aliases_nodes - 1)) << 16)); - unlock_kernel(); - d_instantiate(dentry, inode); + mutex_unlock(&op_mutex); return 0; } -static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) -{ - unsigned int len; - char *p; - const char *name; - int i; - - name = dentry->d_name.name; - len = dentry->d_name.len; - lock_kernel(); - for (i = 0; i < aliases_nodes; i++) - if ((strlen (alias_names [i]) == len) - && !strncmp (name, alias_names[i], len)) { - char buffer[512]; - - p = alias_names [i]; - alias_names [i] = NULL; - kfree (p); - strcpy (buffer, "nvunalias "); - memcpy (buffer + 10, name, len); - buffer [10 + len] = 0; - prom_feval (buffer); - } - unlock_kernel(); - return 0; -} +static kmem_cache_t *op_inode_cachep; -/* {{{ init section */ -static int __init check_space (u16 n) +static struct inode *openprom_alloc_inode(struct super_block *sb) { - unsigned long pages; + struct op_inode_info *oi; - if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { - pages = __get_free_pages (GFP_KERNEL, alloced + 1); - if (!pages) - return -1; + oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL); + if (!oi) + return NULL; - if (nodes) { - memcpy ((char *)pages, (char *)nodes, - (1 << alloced) * PAGE_SIZE); - free_pages ((unsigned long)nodes, alloced); - } - alloced++; - nodes = (openpromfs_node *)pages; - } - return 0; + return &oi->vfs_inode; } -static u16 __init get_nodes (u16 parent, u32 node) +static void openprom_destroy_inode(struct inode *inode) { - char *p; - u16 n = last_node++, i; - char buffer[64]; - - if (check_space (n) < 0) - return 0xffff; - nodes[n].parent = parent; - nodes[n].node = node; - nodes[n].next = 0xffff; - nodes[n].child = 0xffff; - nodes[n].first_prop = first_prop++; - if (!parent) { - char buffer[8]; - int j; - - if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) { - buffer[j] = 0; - if (!strcmp (buffer, "options")) - options = n; - else if (!strcmp (buffer, "aliases")) - aliases = n; - } - } - if (n != aliases) - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) - first_prop++; - else { - char *q; - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) { - if (aliases_nodes == ALIASES_NNODES) - break; - for (i = 0; i < aliases_nodes; i++) - if (!strcmp (p, alias_names [i])) - break; - if (i < aliases_nodes) - continue; - q = kmalloc (strlen (p) + 1, GFP_KERNEL); - if (!q) - return 0xffff; - strcpy (q, p); - alias_names [aliases_nodes++] = q; - } - first_prop += ALIASES_NNODES; - } - node = prom_getchild (node); - if (node) { - parent = get_nodes (n, node); - if (parent == 0xffff) - return 0xffff; - nodes[n].child = parent; - while ((node = prom_getsibling (node)) != 0) { - i = get_nodes (n, node); - if (i == 0xffff) - return 0xffff; - nodes[parent].next = i; - parent = i; - } - } - return n; + kmem_cache_free(op_inode_cachep, OP_I(inode)); } static void openprom_read_inode(struct inode * inode) @@ -1025,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) } static struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, .read_inode = openprom_read_inode, .statfs = simple_statfs, .remount_fs = openprom_remount, @@ -1032,7 +374,8 @@ static struct super_operations openprom_sops = { static int openprom_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; + struct inode *root_inode; + struct op_inode_info *oi; s->s_flags |= MS_NOATIME; s->s_blocksize = 1024; @@ -1043,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) root_inode = iget(s, OPENPROM_ROOT_INO); if (!root_inode) goto out_no_root; + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; @@ -1067,29 +415,39 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; +static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&oi->vfs_inode); +} + static int __init init_openprom_fs(void) { - nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); - if (!nodes) { - printk (KERN_WARNING "openpromfs: can't get free page\n"); - return -EIO; - } - if (get_nodes (0xffff, prom_root_node) == 0xffff) { - printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); - return -EIO; - } - nodes[last_node].first_prop = first_prop; - return register_filesystem(&openprom_fs_type); + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once, NULL); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; } static void __exit exit_openprom_fs(void) { - int i; unregister_filesystem(&openprom_fs_type); - free_pages ((unsigned long)nodes, alloced); - for (i = 0; i < aliases_nodes; i++) - kfree (alias_names [i]); - nodes = NULL; + kmem_cache_destroy(op_inode_cachep); } module_init(init_openprom_fs) diff --git a/fs/pnode.c b/fs/pnode.c index 37b568ed0e05..da42ee61c1df 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt) if (master) { list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; - list_del(&mnt->mnt_slave); - list_add(&mnt->mnt_slave, &master->mnt_slave_list); + list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); } else { @@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) { - list_del(&child->mnt_hash); - list_add_tail(&child->mnt_hash, &mnt->mnt_hash); - } + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 6afff725a8c9..6ba7785319de 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,6 +74,16 @@ #include <linux/poll.h> #include "internal.h" +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + /* * For hysterical raisins we keep the same inumbers as in the old procfs. * Feel free to change the macro below - just keep the range distinct from @@ -121,6 +131,8 @@ enum pid_directory_inos { PROC_TGID_ATTR_PREV, PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, + PROC_TGID_ATTR_KEYCREATE, + PROC_TGID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, @@ -162,6 +174,8 @@ enum pid_directory_inos { PROC_TID_ATTR_PREV, PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, + PROC_TID_ATTR_KEYCREATE, + PROC_TID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, @@ -173,6 +187,9 @@ enum pid_directory_inos { PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 10 + struct pid_entry { int type; int len; @@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = { E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; static struct pid_entry tid_attr_stuff[] = { @@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = { E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; #endif @@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; + int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } -static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int get_nr_threads(struct task_struct *tsk) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); - result = 0; - put_fs_struct(fs); + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); } - return result; + return count; } -static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf return result; } - -/* Same as proc_root_link, but this addionally tries to get fs from other - * threads in the group */ -static int proc_task_root_link(struct inode *inode, struct dentry **dentry, - struct vfsmount **mnt) +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs; + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; - struct task_struct *leader = proc_task(inode); - task_lock(leader); - fs = leader->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(leader); - } else { - /* Try to get fs from other threads */ - task_unlock(leader); - read_lock(&tasklist_lock); - if (pid_alive(leader)) { - struct task_struct *task = leader; - - while ((task = next_thread(task)) != leader) { - task_lock(task); - fs = task->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(task); - break; - } - task_unlock(task); - } - } - read_unlock(&tasklist_lock); + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); } - if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry, return result; } - #define MAY_PTRACE(task) \ (task == current || \ (task->parent == current && \ @@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /************************************************************************/ /* permission checks */ - -/* If the process being read is separated by chroot from the reading process, - * don't let the reader access the threads. - * - * note: this does dput(root) and mntput(vfsmnt) on exit. - */ -static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) -{ - struct dentry *de, *base; - struct vfsmount *our_vfsmnt, *mnt; - int res = 0; - - read_lock(¤t->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(¤t->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (mnt != our_vfsmnt) { - if (mnt == mnt->mnt_parent) - goto out; - de = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - - if (!is_subdir(de, base)) - goto out; - spin_unlock(&vfsmount_lock); - -exit: - dput(base); - mntput(our_vfsmnt); - dput(root); - mntput(vfsmnt); - return res; -out: - spin_unlock(&vfsmount_lock); - res = -EACCES; - goto exit; -} - -static int proc_check_root(struct inode *inode) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */ - return -ENOENT; - return proc_check_chroot(root, vfsmnt); -} - -static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - return proc_check_root(inode); -} - -static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) - return -ENOENT; - - return proc_check_chroot(root, vfsmnt); -} - -extern struct seq_operations proc_pid_maps_op; -static int maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MMU -extern struct seq_operations proc_pid_smaps_op; -static int smaps_open(struct inode *inode, struct file *file) +static int proc_fd_access_allowed(struct inode *inode) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_smaps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_attach(task); + put_task_struct(task); } - return ret; + return allowed; } -static struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -679,16 +560,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -865,6 +765,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char *)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8]; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; - len = sprintf(buffer, "%i\n", oom_adjust); + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; if (count > len-__ppos) @@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8], *end; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; int oom_adjust; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, 8); - if (count > 6) - count = 6; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; oom_adjust = simple_strtol(buffer, &end, 0); @@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -static struct inode_operations proc_mem_inode_operations = { - .permission = proc_permission, -}; - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) return -EPERM; if (count >= PAGE_SIZE) @@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) /* We don't need a base pointer in the /proc filesystem */ path_release(nd); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -#define NUMBUF 10 - static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; - char buf[NUMBUF]; + char buf[PROC_NUMBUF]; struct files_struct * files; struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; filp->f_pos++; case 1: - ino = fake_ino(tid, PROC_TID_INO); + ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) goto out; filp->f_pos++; @@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) continue; rcu_read_unlock(); - j = NUMBUF; + j = PROC_NUMBUF; i = fd; do { j--; @@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) } while (i); ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { + if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { rcu_read_lock(); break; } @@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); - ei->task = NULL; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; - ei->type = ino; + ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + if (!ei->pid) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; - if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { + if (task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -1379,7 +1306,6 @@ out: return inode; out_unlock: - ei->pde = NULL; iput(inode); return NULL; } @@ -1393,13 +1319,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - int fd = proc_type(inode) - PROC_TID_FD_DIR; + struct task_struct *task = get_proc_task(inode); + int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; } -static void pid_base_iput(struct dentry *dentry, struct inode *inode) -{ - struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); - iput(inode); -} - static int pid_delete_dentry(struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } static struct dentry_operations tid_fd_dentry_operations = @@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations = .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_base_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, -}; - /* Lookups */ static unsigned name_to_int(struct dentry *dentry) @@ -1508,22 +1451,24 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); struct file * file; struct files_struct * files; struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) goto out; ei = PROC_I(inode); + ei->fd = fd; files = get_files_struct(task); if (!files) goto out_unlock; @@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, ei->op.proc_get_link = proc_fd_link; dentry->d_op = &tid_fd_dentry_operations; d_add(dentry, inode); - return NULL; + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + result = NULL; +out: + put_task_struct(task); +out_no_task: + return result; out_unlock2: spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -out: - return ERR_PTR(-ENOENT); + goto out; } static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); static struct file_operations proc_fd_operations = { .read = generic_read_dir, @@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, - .permission = proc_permission, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, - .permission = proc_task_permission, + .getattr = proc_task_getattr, }; #ifdef CONFIG_SECURITY @@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif -static int get_tid_list(int index, unsigned int *tids, struct inode *dir); - /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, struct pid_entry *ents) { struct inode *inode; - int error; - struct task_struct *task = proc_task(dir); + struct dentry *error; + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; - error = -ENOENT; + error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (!p->name) goto out; - error = -EINVAL; + error = ERR_PTR(-EINVAL); inode = proc_pid_make_inode(dir->i_sb, task, p->type); if (!inode) goto out; @@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, */ switch(p->type) { case PROC_TGID_TASK: - inode->i_nlink = 2 + get_tid_list(2, NULL, dir); + inode->i_nlink = 2; inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; break; @@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, #endif case PROC_TID_MEM: case PROC_TGID_MEM: - inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; #ifdef CONFIG_SECCOMP @@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_ATTR_EXEC: case PROC_TID_ATTR_FSCREATE: case PROC_TGID_ATTR_FSCREATE: + case PROC_TID_ATTR_KEYCREATE: + case PROC_TGID_ATTR_KEYCREATE: + case PROC_TID_ATTR_SOCKCREATE: + case PROC_TGID_ATTR_SOCKCREATE: inode->i_fop = &proc_pid_attr_operations; break; #endif @@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir, default: printk("procfs: impossible type (%d)",p->type); iput(inode); - return ERR_PTR(-EINVAL); + error = ERR_PTR(-EINVAL); + goto out; } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - return NULL; - + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; out: - return ERR_PTR(error); + put_task_struct(task); +out_no_task: + return error; } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ @@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1917,10 +1892,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = { static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = { }; /** - * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. - * @p: task that should be flushed. + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * + * @task: task that should be flushed. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. * - * Drops the /proc/@pid dcache entry from the hash chains. + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. * - * Dropping /proc/@pid entries and detach_pid must be synchroneous, - * otherwise e.g. /proc/@pid/exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. */ - -struct dentry *proc_pid_unhash(struct task_struct *p) +void proc_flush_task(struct task_struct *task) { - struct dentry *proc_dentry; + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { + if (thread_group_leader(task)) + goto out; - spin_lock(&dcache_lock); - spin_lock(&proc_dentry->d_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - spin_unlock(&proc_dentry->d_lock); - } else { - spin_unlock(&proc_dentry->d_lock); - proc_dentry = NULL; - } - spin_unlock(&dcache_lock); - } - return proc_dentry; -} + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); + leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (!leader) + goto out; -/** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread. - */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - might_sleep(); - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; } /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; struct inode *inode; struct proc_inode *ei; unsigned tgid; - int died; if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { inode = new_inode(dir->i_sb); @@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tgid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + if (!inode) + goto out_put_task; - - if (!inode) { - put_task_struct(task); - goto out; - } inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; @@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct inode->i_nlink = 4; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; - died = 0; d_add(dentry, inode); - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; - if (!pid_alive(task)) { - dentry = proc_pid_unhash(task); - died = 1; - } - spin_unlock(&task->proc_lock); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; +out_put_task: put_task_struct(task); - if (died) { - proc_pid_flush(dentry); - goto out; - } - return NULL; out: - return ERR_PTR(-ENOENT); + return result; } /* SMP-safe */ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; if (leader->tgid != task->tgid) @@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry inode->i_nlink = 3; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; - put_task_struct(task); - return NULL; out_drop_task: put_task_struct(task); out: - return ERR_PTR(-ENOENT); + put_task_struct(leader); +out_no_task: + return result; } -#define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - /* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the first tgid to return to user space. + * + * Usually this is just whatever follows &init_task, but if the users + * buffer was too small to hold the full list or there was a seek into + * the middle of the directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with &init_task and walk nr + * threads past it. */ -static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - p = NULL; - if (version) { - p = find_task_by_pid(version); - if (p && !thread_group_leader(p)) - p = NULL; +static struct task_struct *first_tgid(int tgid, unsigned int nr) +{ + struct task_struct *pos; + rcu_read_lock(); + if (tgid && nr) { + pos = find_task_by_pid(tgid); + if (pos && thread_group_leader(pos)) + goto found; } + /* If nr exceeds the number of processes get out quickly */ + pos = NULL; + if (nr && nr >= nr_processes()) + goto done; - if (p) - index = 0; - else - p = next_task(&init_task); - - for ( ; p != &init_task; p = next_task(p)) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; + /* If we haven't found our starting place yet start with + * the init_task and walk nr tasks forward. + */ + for (pos = next_task(&init_task); nr > 0; --nr) { + pos = next_task(pos); + if (pos == &init_task) { + pos = NULL; + goto done; + } } - read_unlock(&tasklist_lock); - return nr_tgids; +found: + get_task_struct(pos); +done: + rcu_read_unlock(); + return pos; } /* - * Get a few tid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the next task in the task list. + * Return NULL if we loop or there is any error. + * + * The reference to the input task_struct is released. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - /* - * The starting point task (leader_task) might be an already - * unlinked task, which cannot be used to access the task-list - * via next_thread(). - */ - if (pid_alive(task)) do { - int tid = task->pid; - - if (--index >= 0) - continue; - if (tids != NULL) - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; +static struct task_struct *next_tgid(struct task_struct *start) +{ + struct task_struct *pos; + rcu_read_lock(); + pos = start; + if (pid_alive(start)) + pos = next_task(start); + if (pid_alive(pos) && (pos != &init_task)) { + get_task_struct(pos); + goto done; + } + pos = NULL; +done: + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; - int next_tgid; + struct task_struct *task; + int tgid; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; nr++; } + nr -= 1; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - next_tgid = filp->f_version; + tgid = filp->f_version; filp->f_version = 0; - for (;;) { - nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); - if (!nr_tgids) { - /* no more entries ! */ + for (task = first_tgid(tgid, nr); + task; + task = next_tgid(task), filp->f_pos++) { + int len; + ino_t ino; + tgid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tgid); + ino = fake_ino(tgid, PROC_TGID_INO); + if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = tgid; + put_task_struct(task); break; } - next_tgid = 0; + } + return 0; +} - /* do not use the last found pid, reserve it for next_tgid */ - if (nr_tgids == PROC_MAXPIDS) { - nr_tgids--; - next_tgid = tgid_array[nr_tgids]; - } +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr) +{ + struct task_struct *pos; - for (i=0;i<nr_tgids;i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid(tid); + if (pos && (pos->group_leader == leader)) + goto found; + } - do - buf[--j] = '0' + (tgid % 10); - while ((tgid /= 10) != 0); + /* If nr exceeds the number of threads there is nothing todo */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid_array[i]; - goto out; - } - filp->f_pos++; - nr++; + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; } } +found: + get_task_struct(pos); out: - return 0; + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *leader = get_proc_task(inode); + struct task_struct *task; int retval = -ENOENT; ino_t ino; + int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(proc_task(inode))) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); - inode->i_nlink = pos + nr_tids; - - for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; - int tid = tid_array[i]; - - ino = fake_ino(tid,PROC_TID_INO); - - do - buf[--j] = '0' + (tid % 10); - while ((tid /= 10) != 0); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. + */ + tid = filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, pos - 2); + task; + task = next_tid(task), pos++) { + int len; + tid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tid); + ino = fake_ino(tid, PROC_TID_INO); + if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = tid; + put_task_struct(task); break; - pos++; + } } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); + } + + return 0; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 722b9c463111..6dcef089e18e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; - ei->type = 0; + ei->pid = NULL; + ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; inode = &ei->vfs_inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0502f17b860d..146a434ba944 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_type(struct inode *inode) +static inline int proc_fd(struct inode *inode) { - return PROC_I(inode)->type; + return PROC_I(inode)->fd; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 91b7c15ab373..0a163a4f7764 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm = get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -118,9 +122,15 @@ struct mem_size_stats unsigned long private_dirty; }; +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -153,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats pad_len_spaces(m, len); seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); } else { - if (mm) { - if (vma->vm_start <= mm->start_brk && + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->start_brk && vma->vm_end >= mm->brk) { - pad_len_spaces(m, len); - seq_puts(m, "[heap]"); - } else { - if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - - pad_len_spaces(m, len); - seq_puts(m, "[stack]"); + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; } + } else { + name = "[vdso]"; } - } else { + } + if (name) { pad_len_spaces(m, len); - seq_puts(m, "[vdso]"); + seq_puts(m, name); } } seq_putc(m, '\n'); @@ -295,12 +306,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. We have zero last_addr at @@ -311,11 +326,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +369,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,38 +380,103 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? tail_vma: NULL; } -struct seq_operations proc_pid_maps_op = { +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; -struct seq_operations proc_pid_smaps_op = { +static struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -struct seq_operations proc_pid_numa_maps_op = { +static struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map }; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f68827ed10e..af69f28277b6 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; } -struct seq_operations proc_pid_maps_op = { +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; + +static int maps_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &proc_pid_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = NULL; + } + return ret; +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index cf6e1cf40351..752cea12e30f 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - const struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b73529b8099..49d1a53dbef0 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock, get_bh(bh); if (test_set_buffer_locked(bh)) { if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); @@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock, ret = -EIO; } if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); diff --git a/fs/select.c b/fs/select.c index 9c4f0f2604f1..33b72ba0f86f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -746,9 +746,9 @@ out_fds: asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, long timeout_msecs) { - s64 timeout_jiffies = 0; + s64 timeout_jiffies; - if (timeout_msecs) { + if (timeout_msecs > 0) { #if HZ > 1000 /* We can only overflow if HZ > 1000 */ if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ) @@ -756,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, else #endif timeout_jiffies = msecs_to_jiffies(timeout_msecs); + } else { + /* Infinite (< 0) or no (0) timeout */ + timeout_jiffies = timeout_msecs; } return do_sys_poll(ufds, nfds, &timeout_jiffies); diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71dd2760d32..c8e96195b96e 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req) if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) goto out; - list_del_init(&req->rq_queue); - list_add_tail(&req->rq_queue, &server->recvq); + list_move_tail(&req->rq_queue, &server->recvq); result = 1; out: return result; @@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server) result = smb_request_send_req(req); if (result < 0) { server->conn_error = result; - list_del_init(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); result = -EIO; goto out; } diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index 481a97a423fa..24577e2c489b 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -20,6 +20,7 @@ #include <linux/smp_lock.h> #include <linux/module.h> #include <linux/net.h> +#include <linux/kthread.h> #include <net/ip.h> #include <linux/smb_fs.h> @@ -40,7 +41,7 @@ enum smbiod_state { }; static enum smbiod_state smbiod_state = SMBIOD_DEAD; -static pid_t smbiod_pid; +static struct task_struct *smbiod_thread; static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait); static LIST_HEAD(smb_servers); static DEFINE_SPINLOCK(servers_lock); @@ -67,20 +68,29 @@ void smbiod_wake_up(void) */ static int smbiod_start(void) { - pid_t pid; + struct task_struct *tsk; + int err = 0; + if (smbiod_state != SMBIOD_DEAD) return 0; smbiod_state = SMBIOD_STARTING; __module_get(THIS_MODULE); spin_unlock(&servers_lock); - pid = kernel_thread(smbiod, NULL, 0); - if (pid < 0) + tsk = kthread_run(smbiod, NULL, "smbiod"); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); module_put(THIS_MODULE); + } spin_lock(&servers_lock); - smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING; - smbiod_pid = pid; - return pid; + if (err < 0) { + smbiod_state = SMBIOD_DEAD; + smbiod_thread = NULL; + } else { + smbiod_state = SMBIOD_RUNNING; + smbiod_thread = tsk; + } + return err; } /* @@ -183,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server) if (req->rq_flags & SMB_REQ_RETRY) { /* must move the request to the xmitq */ VERBOSE("retrying request %p on recvq\n", req); - list_del(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); continue; } #endif @@ -290,8 +299,6 @@ out: */ static int smbiod(void *unused) { - daemonize("smbiod"); - allow_signal(SIGKILL); VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid); diff --git a/fs/super.c b/fs/super.c index 057b5325b7ef..8a669f6f3f52 100644 --- a/fs/super.c +++ b/fs/super.c @@ -871,8 +871,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data) return mnt; } -EXPORT_SYMBOL_GPL(do_kern_mount); - struct vfsmount *kern_mount(struct file_system_type *type) { return vfs_kern_mount(type, 0, type->name, NULL); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 610b5bdbe75b..61c42430cba3 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); - } + if (filp->f_pos == 2) + list_move(q, &parent_sd->s_children); + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; @@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 3ada9dcf55b8..95b878e5c7a0 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -21,14 +21,6 @@ #include "swab.h" #include "util.h" -#undef UFS_BALLOC_DEBUG - -#ifdef UFS_BALLOC_DEBUG -#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *); static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *); static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *); @@ -39,7 +31,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, /* * Free 'count' fragments from fragment number 'fragment' */ -void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) { +void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count) +{ struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; @@ -51,7 +44,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) + UFSD("ENTER, fragment %u, count %u\n", fragment, count); if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); @@ -68,7 +61,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count ucpi = ufs_load_cylinder (sb, cgno); if (!ucpi) goto failed; - ucg = ubh_get_ucg (UCPI_UBH); + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) { ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno); goto failed; @@ -76,11 +69,11 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count end_bit = bit + count; bbase = ufs_blknum (bit); - blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1); for (i = bit; i < end_bit; i++) { - if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i)) - ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i); + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i); else ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); @@ -90,51 +83,52 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count fs32_add(sb, &ucg->cg_cs.cs_nffree, count); - fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count); + uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); - blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase); + blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase); ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1); /* * Trying to reassemble free fragments into block */ blkno = ufs_fragstoblks (bbase); - if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb); - fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb); + uspi->cs_total.cs_nffree -= uspi->s_fpb; fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); cylno = ufs_cbtocylno (bbase); fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1); fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); } - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } sb->s_dirt = 1; unlock_super (sb); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return; failed: unlock_super (sb); - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return; } /* * Free 'count' fragments from fragment number 'fragment' (free whole blocks) */ -void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) { +void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count) +{ struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; @@ -146,7 +140,7 @@ void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) { uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - UFSD(("ENTER, fragment %u, count %u\n", fragment, count)) + UFSD("ENTER, fragment %u, count %u\n", fragment, count); if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) { ufs_error (sb, "ufs_free_blocks", "internal error, " @@ -162,7 +156,7 @@ do_more: bit = ufs_dtogd (fragment); if (cgno >= uspi->s_ncg) { ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device"); - goto failed; + goto failed_unlock; } end_bit = bit + count; if (end_bit > uspi->s_fpg) { @@ -173,36 +167,36 @@ do_more: ucpi = ufs_load_cylinder (sb, cgno); if (!ucpi) - goto failed; - ucg = ubh_get_ucg (UCPI_UBH); + goto failed_unlock; + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) { ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno); - goto failed; + goto failed_unlock; } for (i = bit; i < end_bit; i += uspi->s_fpb) { blkno = ufs_fragstoblks(i); - if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) { + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) { ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } - ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno); + ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); DQUOT_FREE_BLOCK(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); - fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1); + uspi->cs_total.cs_nbfree++; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1); cylno = ufs_cbtocylno(i); fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1); fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1); } - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } if (overflow) { @@ -213,38 +207,127 @@ do_more: sb->s_dirt = 1; unlock_super (sb); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return; -failed: +failed_unlock: unlock_super (sb); - UFSD(("EXIT (FAILED)\n")) +failed: + UFSD("EXIT (FAILED)\n"); return; } +static struct page *ufs_get_locked_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page; + +try_again: + page = find_lock_page(mapping, index); + if (!page) { + page = read_cache_page(mapping, index, + (filler_t*)mapping->a_ops->readpage, + NULL); + if (IS_ERR(page)) { + printk(KERN_ERR "ufs_change_blocknr: " + "read_cache_page error: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + goto out; + } + lock_page(page); -#define NULLIFY_FRAGMENTS \ - for (i = oldcount; i < newcount; i++) { \ - bh = sb_getblk(sb, result + i); \ - memset (bh->b_data, 0, sb->s_blocksize); \ - set_buffer_uptodate(bh); \ - mark_buffer_dirty (bh); \ - if (IS_SYNC(inode)) \ - sync_dirty_buffer(bh); \ - brelse (bh); \ + if (!PageUptodate(page) || PageError(page)) { + unlock_page(page); + page_cache_release(page); + + printk(KERN_ERR "ufs_change_blocknr: " + "can not read page: ino %lu, index: %lu\n", + mapping->host->i_ino, index); + + page = ERR_PTR(-EIO); + goto out; + } } -unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, - unsigned goal, unsigned count, int * err ) + if (unlikely(!page->mapping || !page_has_buffers(page))) { + unlock_page(page); + page_cache_release(page); + goto try_again;/*we really need these buffers*/ + } +out: + return page; +} + +/* + * Modify inode page cache in such way: + * have - blocks with b_blocknr equal to oldb...oldb+count-1 + * get - blocks with b_blocknr equal to newb...newb+count-1 + * also we suppose that oldb...oldb+count-1 blocks + * situated at the end of file. + * + * We can come here from ufs_writepage or ufs_prepare_write, + * locked_page is argument of these functions, so we already lock it. + */ +static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk, + unsigned int count, unsigned int oldb, + unsigned int newb, struct page *locked_page) +{ + unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct address_space *mapping = inode->i_mapping; + pgoff_t index, cur_index = locked_page->index; + unsigned int i, j; + struct page *page; + struct buffer_head *head, *bh; + + UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n", + inode->i_ino, count, oldb, newb); + + BUG_ON(!PageLocked(locked_page)); + + for (i = 0; i < count; i += blk_per_page) { + index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (likely(cur_index != index)) { + page = ufs_get_locked_page(mapping, index); + if (IS_ERR(page)) + continue; + } else + page = locked_page; + + j = i; + head = page_buffers(page); + bh = head; + do { + if (likely(bh->b_blocknr == j + oldb && j < count)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + bh->b_blocknr = newb + j++; + mark_buffer_dirty(bh); + } + + bh = bh->b_this_page; + } while (bh != head); + + set_page_dirty(page); + + if (likely(cur_index != index)) { + unlock_page(page); + page_cache_release(page); + } + } + UFSD("EXIT\n"); +} + +unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment, + unsigned goal, unsigned count, int * err, struct page *locked_page) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; - struct buffer_head * bh; - unsigned cgno, oldcount, newcount, tmp, request, i, result; + unsigned cgno, oldcount, newcount, tmp, request, result; - UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count)) + UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -273,14 +356,14 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, return (unsigned)-1; } if (fragment < UFS_I(inode)->i_lastfrag) { - UFSD(("EXIT (ALREADY ALLOCATED)\n")) + UFSD("EXIT (ALREADY ALLOCATED)\n"); unlock_super (sb); return 0; } } else { if (tmp) { - UFSD(("EXIT (ALREADY ALLOCATED)\n")) + UFSD("EXIT (ALREADY ALLOCATED)\n"); unlock_super(sb); return 0; } @@ -289,9 +372,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, /* * There is not enough space for user on the device */ - if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) { + if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { unlock_super (sb); - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return 0; } @@ -310,12 +393,10 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, if (result) { *p = cpu_to_fs32(sb, result); *err = 0; - inode->i_blocks += count << uspi->s_nspfshift; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); - NULLIFY_FRAGMENTS } unlock_super(sb); - UFSD(("EXIT, result %u\n", result)) + UFSD("EXIT, result %u\n", result); return result; } @@ -325,11 +406,9 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); if (result) { *err = 0; - inode->i_blocks += count << uspi->s_nspfshift; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); - NULLIFY_FRAGMENTS unlock_super(sb); - UFSD(("EXIT, result %u\n", result)) + UFSD("EXIT, result %u\n", result); return result; } @@ -339,8 +418,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, switch (fs32_to_cpu(sb, usb1->fs_optim)) { case UFS_OPTSPACE: request = newcount; - if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) - > uspi->s_dsize * uspi->s_minfree / (2 * 100) ) + if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree + > uspi->s_dsize * uspi->s_minfree / (2 * 100)) break; usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); break; @@ -349,7 +428,7 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, case UFS_OPTTIME: request = uspi->s_fpb; - if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize * + if (uspi->cs_total.cs_nffree < uspi->s_dsize * (uspi->s_minfree - 2) / 100) break; usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); @@ -357,39 +436,22 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment, } result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { - for (i = 0; i < oldcount; i++) { - bh = sb_bread(sb, tmp + i); - if(bh) - { - clear_buffer_dirty(bh); - bh->b_blocknr = result + i; - mark_buffer_dirty (bh); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); - brelse (bh); - } - else - { - printk(KERN_ERR "ufs_new_fragments: bread fail\n"); - unlock_super(sb); - return 0; - } - } + ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp, + result, locked_page); + *p = cpu_to_fs32(sb, result); *err = 0; - inode->i_blocks += count << uspi->s_nspfshift; UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count); - NULLIFY_FRAGMENTS unlock_super(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); - UFSD(("EXIT, result %u\n", result)) + UFSD("EXIT, result %u\n", result); return result; } unlock_super(sb); - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return 0; } @@ -404,7 +466,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; - UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount)) + UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -419,7 +481,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, ucpi = ufs_load_cylinder (sb, cgno); if (!ucpi) return 0; - ucg = ubh_get_ucg (UCPI_UBH); + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) { ufs_panic (sb, "ufs_add_fragments", "internal error, bad magic number on cg %u", cgno); @@ -429,14 +491,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, fragno = ufs_dtogd (fragment); fragoff = ufs_fragnum (fragno); for (i = oldcount; i < newcount; i++) - if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; /* * Block can be extended */ ucg->cg_time = cpu_to_fs32(sb, get_seconds()); for (i = newcount; i < (uspi->s_fpb - fragoff); i++) - if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i)) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) break; fragsize = i - oldcount; if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize])) @@ -446,7 +508,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, if (fragsize != count) fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) - ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i); + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); if(DQUOT_ALLOC_BLOCK(inode, count)) { *err = -EDQUOT; return 0; @@ -454,17 +516,17 @@ ufs_add_fragments (struct inode * inode, unsigned fragment, fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); - fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); + uspi->cs_total.cs_nffree -= count; - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } sb->s_dirt = 1; - UFSD(("EXIT, fragment %u\n", fragment)) + UFSD("EXIT, fragment %u\n", fragment); return fragment; } @@ -487,7 +549,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, result, allocsize; - UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count)) + UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -521,14 +583,14 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno, UFS_TEST_FREE_SPACE_CG } - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return 0; cg_found: ucpi = ufs_load_cylinder (sb, cgno); if (!ucpi) return 0; - ucg = ubh_get_ucg (UCPI_UBH); + ucg = ubh_get_ucg (UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_alloc_fragments", "internal error, bad magic number on cg %u", cgno); @@ -551,12 +613,12 @@ cg_found: return 0; goal = ufs_dtogd (result); for (i = count; i < uspi->s_fpb; i++) - ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i); + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; DQUOT_FREE_BLOCK(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); - fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i); + uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); fs32_add(sb, &ucg->cg_frsum[i], 1); goto succed; @@ -570,10 +632,10 @@ cg_found: return 0; } for (i = 0; i < count; i++) - ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i); + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); - fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count); + uspi->cs_total.cs_nffree -= count; fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); fs32_sub(sb, &ucg->cg_frsum[allocsize], 1); @@ -581,16 +643,16 @@ cg_found: fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1); succed: - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } sb->s_dirt = 1; result += cgno * uspi->s_fpg; - UFSD(("EXIT3, result %u\n", result)) + UFSD("EXIT3, result %u\n", result); return result; } @@ -603,12 +665,12 @@ static unsigned ufs_alloccg_block (struct inode * inode, struct ufs_cylinder_group * ucg; unsigned result, cylno, blkno; - UFSD(("ENTER, goal %u\n", goal)) + UFSD("ENTER, goal %u\n", goal); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); - ucg = ubh_get_ucg(UCPI_UBH); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal == 0) { goal = ucpi->c_rotor; @@ -620,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode, /* * If the requested block is available, use it. */ - if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) { + if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) { result = goal; goto gotit; } @@ -632,7 +694,7 @@ norot: ucpi->c_rotor = result; gotit: blkno = ufs_fragstoblks(result); - ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno); + ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) { @@ -641,31 +703,76 @@ gotit: } fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); - fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1); + uspi->cs_total.cs_nbfree--; fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1); cylno = ufs_cbtocylno(result); fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1); fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1); - UFSD(("EXIT, result %u\n", result)) + UFSD("EXIT, result %u\n", result); return result; } -static unsigned ufs_bitmap_search (struct super_block * sb, - struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count) +static unsigned ubh_scanc(struct ufs_sb_private_info *uspi, + struct ufs_buffer_head *ubh, + unsigned begin, unsigned size, + unsigned char *table, unsigned char mask) { - struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; - struct ufs_cylinder_group * ucg; - unsigned start, length, location, result; - unsigned possition, fragsize, blockmap, mask; - - UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count)) + unsigned rest, offset; + unsigned char *cp; + + + offset = begin & ~uspi->s_fmask; + begin >>= uspi->s_fshift; + for (;;) { + if ((offset + size) < uspi->s_fsize) + rest = size; + else + rest = uspi->s_fsize - offset; + size -= rest; + cp = ubh->bh[begin]->b_data + offset; + while ((table[*cp++] & mask) == 0 && --rest) + ; + if (rest || !size) + break; + begin++; + offset = 0; + } + return (size + rest); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * @sp: pointer to super block + * @ucpi: pointer to cylinder group info + * @goal: near which block we want find new one + * @count: specified size + */ +static unsigned ufs_bitmap_search(struct super_block *sb, + struct ufs_cg_private_info *ucpi, + unsigned goal, unsigned count) +{ + /* + * Bit patterns for identifying fragments in the block map + * used as ((map & mask_arr) == want_arr) + */ + static const int mask_arr[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff + }; + static const int want_arr[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe + }; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_cylinder_group *ucg; + unsigned start, length, loc, result; + unsigned pos, want, blockmap, mask, end; + + UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count); - uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first (uspi); - ucg = ubh_get_ucg(UCPI_UBH); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) start = ufs_dtogd(goal) >> 3; @@ -673,53 +780,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb, start = ucpi->c_frotor >> 3; length = ((uspi->s_fpg + 7) >> 3) - start; - location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length, + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length, (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, 1 << (count - 1 + (uspi->s_fpb & 7))); - if (location == 0) { + if (loc == 0) { length = start + 1; - location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, - (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other, - 1 << (count - 1 + (uspi->s_fpb & 7))); - if (location == 0) { - ufs_error (sb, "ufs_bitmap_search", - "bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n", - ucpi->c_cgx, start, length, count, ucpi->c_freeoff); + loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length, + (uspi->s_fpb == 8) ? ufs_fragtable_8fpb : + ufs_fragtable_other, + 1 << (count - 1 + (uspi->s_fpb & 7))); + if (loc == 0) { + ufs_error(sb, "ufs_bitmap_search", + "bitmap corrupted on cg %u, start %u," + " length %u, count %u, freeoff %u\n", + ucpi->c_cgx, start, length, count, + ucpi->c_freeoff); return (unsigned)-1; } start = 0; } - result = (start + length - location) << 3; + result = (start + length - loc) << 3; ucpi->c_frotor = result; /* * found the byte in the map */ - blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result); - fragsize = 0; - for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) { - if (blockmap & mask) { - if (!(possition & uspi->s_fpbmask)) - fragsize = 1; - else - fragsize++; - } - else { - if (fragsize == count) { - result += possition - count; - UFSD(("EXIT, result %u\n", result)) - return result; - } - fragsize = 0; - } - } - if (fragsize == count) { - result += possition - count; - UFSD(("EXIT, result %u\n", result)) - return result; - } - ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx); - UFSD(("EXIT (FAILED)\n")) + + for (end = result + 8; result < end; result += uspi->s_fpb) { + blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result); + blockmap <<= 1; + mask = mask_arr[count]; + want = want_arr[count]; + for (pos = 0; pos <= uspi->s_fpb - count; pos++) { + if ((blockmap & mask) == want) { + UFSD("EXIT, result %u\n", result); + return result + pos; + } + mask <<= 1; + want <<= 1; + } + } + + ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n", + ucpi->c_cgx); + UFSD("EXIT (FAILED)\n"); return (unsigned)-1; } @@ -734,9 +838,9 @@ static void ufs_clusteracct(struct super_block * sb, return; if (cnt > 0) - ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno); + ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); else - ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno); + ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno); /* * Find the size of the cluster going forward. @@ -745,7 +849,7 @@ static void ufs_clusteracct(struct super_block * sb, end = start + uspi->s_contigsumsize; if ( end >= ucpi->c_nclusterblks) end = ucpi->c_nclusterblks; - i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start); + i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start); if (i > end) i = end; forw = i - start; @@ -757,7 +861,7 @@ static void ufs_clusteracct(struct super_block * sb, end = start - uspi->s_contigsumsize; if (end < 0 ) end = -1; - i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end); + i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end); if ( i < end) i = end; back = start - i; @@ -769,11 +873,11 @@ static void ufs_clusteracct(struct super_block * sb, i = back + forw + 1; if (i > uspi->s_contigsumsize) i = uspi->s_contigsumsize; - fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt); + fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt); if (back > 0) - fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt); + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt); if (forw > 0) - fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt); + fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt); } diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c index 14abb8b835f7..09c39e5e6386 100644 --- a/fs/ufs/cylinder.c +++ b/fs/ufs/cylinder.c @@ -20,15 +20,6 @@ #include "swab.h" #include "util.h" -#undef UFS_CYLINDER_DEBUG - -#ifdef UFS_CYLINDER_DEBUG -#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - - /* * Read cylinder group into cache. The memory space for ufs_cg_private_info * structure is already allocated during ufs_read_super. @@ -42,19 +33,19 @@ static void ufs_read_cylinder (struct super_block * sb, struct ufs_cylinder_group * ucg; unsigned i, j; - UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr)) + UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr); uspi = sbi->s_uspi; ucpi = sbi->s_ucpi[bitmap_nr]; ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data; - UCPI_UBH->fragment = ufs_cgcmin(cgno); - UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits; + UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno); + UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits; /* * We have already the first fragment of cylinder group block in buffer */ - UCPI_UBH->bh[0] = sbi->s_ucg[cgno]; - for (i = 1; i < UCPI_UBH->count; i++) - if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i))) + UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno]; + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) + if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i))) goto failed; sbi->s_cgno[bitmap_nr] = cgno; @@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb, ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff); ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff); ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return; failed: @@ -95,15 +86,15 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr) struct ufs_cylinder_group * ucg; unsigned i; - UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr)) + UFSD("ENTER, bitmap_nr %u\n", bitmap_nr); uspi = sbi->s_uspi; if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) { - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return; } ucpi = sbi->s_ucpi[bitmap_nr]; - ucg = ubh_get_ucg(UCPI_UBH); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) { ufs_panic (sb, "ufs_put_cylinder", "internal error"); @@ -116,13 +107,13 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr) ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor); ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor); ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor); - ubh_mark_buffer_dirty (UCPI_UBH); - for (i = 1; i < UCPI_UBH->count; i++) { - brelse (UCPI_UBH->bh[i]); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); + for (i = 1; i < UCPI_UBH(ucpi)->count; i++) { + brelse (UCPI_UBH(ucpi)->bh[i]); } sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY; - UFSD(("EXIT\n")) + UFSD("EXIT\n"); } /* @@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder ( struct ufs_cg_private_info * ucpi; unsigned cg, i, j; - UFSD(("ENTER, cgno %u\n", cgno)) + UFSD("ENTER, cgno %u\n", cgno); uspi = sbi->s_uspi; if (cgno >= uspi->s_ncg) { @@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder ( * Cylinder group number cg it in cache and it was last used */ if (sbi->s_cgno[0] == cgno) { - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return sbi->s_ucpi[0]; } /* @@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder ( if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) { if (sbi->s_cgno[cgno] != cgno) { ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache"); - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return NULL; } else { - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return sbi->s_ucpi[cgno]; } } else { ufs_read_cylinder (sb, cgno, cgno); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return sbi->s_ucpi[cgno]; } } @@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder ( sbi->s_ucpi[0] = ucpi; ufs_read_cylinder (sb, cgno, 0); } - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return sbi->s_ucpi[0]; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 1a561202d3f4..7f0a0aa63584 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -11,31 +11,20 @@ * 4.4BSD (FreeBSD) support added on February 1st 1998 by * Niels Kristian Bech Jensen <nkbj@image.dk> partially based * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>. + * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/ufs_fs.h> #include <linux/smp_lock.h> -#include <linux/buffer_head.h> #include <linux/sched.h> #include "swab.h" #include "util.h" -#undef UFS_DIR_DEBUG - -#ifdef UFS_DIR_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - -static int -ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *, - struct buffer_head *, unsigned long); - - /* * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure. * @@ -51,495 +40,541 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -/* - * This is blatantly stolen from ext2fs - */ -static int -ufs_readdir (struct file * filp, void * dirent, filldir_t filldir) +static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *inode = filp->f_dentry->d_inode; - int error = 0; - unsigned long offset, lblk; - int i, stored; - struct buffer_head * bh; - struct ufs_dir_entry * de; - struct super_block * sb; - int de_reclen; - unsigned flags; - u64 blk= 0L; - - lock_kernel(); - - sb = inode->i_sb; - flags = UFS_SB(sb)->s_flags; - - UFSD(("ENTER, ino %lu f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos)) - - stored = 0; - bh = NULL; - offset = filp->f_pos & (sb->s_blocksize - 1); - - while (!error && !stored && filp->f_pos < inode->i_size) { - lblk = (filp->f_pos) >> sb->s_blocksize_bits; - blk = ufs_frag_map(inode, lblk); - if (!blk || !(bh = sb_bread(sb, blk))) { - /* XXX - error - skip to the next block */ - printk("ufs_readdir: " - "dir inode %lu has a hole at offset %lu\n", - inode->i_ino, (unsigned long int)filp->f_pos); - filp->f_pos += sb->s_blocksize - offset; - continue; - } - -revalidate: - /* If the dir block has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the block - * to make sure. */ - if (filp->f_version != inode->i_version) { - for (i = 0; i < sb->s_blocksize && i < offset; ) { - de = (struct ufs_dir_entry *)(bh->b_data + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - de_reclen = fs16_to_cpu(sb, de->d_reclen); - if (de_reclen < 1) - break; - i += de_reclen; - } - offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) - | offset; - filp->f_version = inode->i_version; - } + struct inode *dir = page->mapping->host; + int err = 0; + dir->i_version++; + page->mapping->a_ops->commit_write(NULL, page, from, to); + if (IS_DIRSYNC(dir)) + err = write_one_page(page, 1); + else + unlock_page(page); + return err; +} - while (!error && filp->f_pos < inode->i_size - && offset < sb->s_blocksize) { - de = (struct ufs_dir_entry *) (bh->b_data + offset); - /* XXX - put in a real ufs_check_dir_entry() */ - if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) { - filp->f_pos = (filp->f_pos & - (sb->s_blocksize - 1)) + - sb->s_blocksize; - brelse(bh); - unlock_kernel(); - return stored; - } - if (!ufs_check_dir_entry ("ufs_readdir", inode, de, - bh, offset)) { - /* On error, skip the f_pos to the - next block. */ - filp->f_pos = (filp->f_pos | - (sb->s_blocksize - 1)) + - 1; - brelse (bh); - unlock_kernel(); - return stored; - } - offset += fs16_to_cpu(sb, de->d_reclen); - if (de->d_ino) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. */ - unsigned long version = filp->f_version; - unsigned char d_type = DT_UNKNOWN; +static inline void ufs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} - UFSD(("filldir(%s,%u)\n", de->d_name, - fs32_to_cpu(sb, de->d_ino))) - UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de))) +static inline unsigned long ufs_dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} - if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) - d_type = de->d_u.d_44.d_type; - error = filldir(dirent, de->d_name, - ufs_get_de_namlen(sb, de), filp->f_pos, - fs32_to_cpu(sb, de->d_ino), d_type); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; - } - filp->f_pos += fs16_to_cpu(sb, de->d_reclen); - } - offset = 0; - brelse (bh); +ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct ufs_dir_entry *de; + struct page *page; + + de = ufs_find_entry(dir, dentry, &page); + if (de) { + res = fs32_to_cpu(dir->i_sb, de->d_ino); + ufs_put_page(page); } - unlock_kernel(); - return 0; + return res; } -/* - * define how far ahead to read directories while searching them. - */ -#define NAMEI_RA_CHUNKS 2 -#define NAMEI_RA_BLOCKS 4 -#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) -/* - * ufs_find_entry() - * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_bh). It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - */ -struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry, - struct buffer_head ** res_bh) +/* Releases the page */ +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode) { - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh_read[NAMEI_RA_SIZE]; - unsigned long offset; - int block, toread, i, err; - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen); + int err; - UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen)) - - *res_bh = NULL; - - sb = dir->i_sb; - - if (namelen > UFS_MAXNAMLEN) - return NULL; + lock_page(page); + err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + BUG_ON(err); + de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); + ufs_set_de_type(dir->i_sb, de, inode->i_mode); + err = ufs_commit_chunk(page, from, to); + ufs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dir); +} - memset (bh_use, 0, sizeof (bh_use)); - toread = 0; - for (block = 0; block < NAMEI_RA_SIZE; ++block) { - struct buffer_head * bh; - if ((block << sb->s_blocksize_bits) >= dir->i_size) - break; - bh = ufs_getfrag (dir, block, 0, &err); - bh_use[block] = bh; - if (bh && !buffer_uptodate(bh)) - bh_read[toread++] = bh; +static void ufs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct ufs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (UFS_SECTOR_SIZE - 1)) + goto Ebadsize; + if (!limit) + goto out; } + for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct ufs_dir_entry *)(kaddr + offs); + rec_len = fs16_to_cpu(sb, p->d_reclen); + + if (rec_len < UFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p))) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1)) + goto Espan; + if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * + UFS_SB(sb)->s_uspi->s_ncg)) + goto Einumber; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + ufs_error(sb, "ufs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; + goto bad_entry; +Einumber: + error = "inode out of bounds"; +bad_entry: + ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + rec_len, ufs_get_de_namlen(sb, p)); + goto fail; +Eend: + p = (struct ufs_dir_entry *)(kaddr + offs); + ufs_error (sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs); +fail: + SetPageChecked(page); + SetPageError(page); +} - for (block = 0, offset = 0; offset < dir->i_size; block++) { - struct buffer_head * bh; - struct ufs_dir_entry * de; - char * dlimit; - - if ((block % NAMEI_RA_BLOCKS) == 0 && toread) { - ll_rw_block (READ, toread, bh_read); - toread = 0; - } - bh = bh_use[block % NAMEI_RA_SIZE]; - if (!bh) { - ufs_error (sb, "ufs_find_entry", - "directory #%lu contains a hole at offset %lu", - dir->i_ino, offset); - offset += sb->s_blocksize; - continue; - } - wait_on_buffer (bh); - if (!buffer_uptodate(bh)) { - /* - * read error: all bets are off - */ - break; - } - - de = (struct ufs_dir_entry *) bh->b_data; - dlimit = bh->b_data + sb->s_blocksize; - while ((char *) de < dlimit && offset < dir->i_size) { - /* this code is executed quadratically often */ - /* do minimal checking by hand */ - int de_len; - - if ((char *) de + namelen <= dlimit && - ufs_match(sb, namelen, name, de)) { - /* found a match - - just to be sure, do a full check */ - if (!ufs_check_dir_entry("ufs_find_entry", - dir, de, bh, offset)) - goto failed; - for (i = 0; i < NAMEI_RA_SIZE; ++i) { - if (bh_use[i] != bh) - brelse (bh_use[i]); - } - *res_bh = bh; - return de; - } - /* prevent looping on a bad block */ - de_len = fs16_to_cpu(sb, de->d_reclen); - if (de_len <= 0) - goto failed; - offset += de_len; - de = (struct ufs_dir_entry *) ((char *) de + de_len); - } - - brelse (bh); - if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >= - dir->i_size) - bh = NULL; - else - bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err); - bh_use[block % NAMEI_RA_SIZE] = bh; - if (bh && !buffer_uptodate(bh)) - bh_read[toread++] = bh; +static struct page *ufs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_cache_page(mapping, n, + (filler_t*)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + ufs_check_page(page); + if (PageError(page)) + goto fail; } + return page; -failed: - for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]); - UFSD(("EXIT\n")) - return NULL; +fail: + ufs_put_page(page); + return ERR_PTR(-EIO); } -static int -ufs_check_dir_entry (const char *function, struct inode *dir, - struct ufs_dir_entry *de, struct buffer_head *bh, - unsigned long offset) +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned +ufs_last_byte(struct inode *inode, unsigned long page_nr) { - struct super_block *sb = dir->i_sb; - const char *error_msg = NULL; - int rlen = fs16_to_cpu(sb, de->d_reclen); - - if (rlen < UFS_DIR_REC_LEN(1)) - error_msg = "reclen is smaller than minimal"; - else if (rlen % 4 != 0) - error_msg = "reclen % 4 != 0"; - else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))) - error_msg = "reclen is too small for namlen"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) - error_msg = "directory entry across blocks"; - else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg * - UFS_SB(sb)->s_uspi->s_ncg)) - error_msg = "inode out of bounds"; - - if (error_msg != NULL) - ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - " - "offset=%lu, inode=%lu, reclen=%d, namlen=%d", - dir->i_ino, dir->i_size, error_msg, offset, - (unsigned long)fs32_to_cpu(sb, de->d_ino), - rlen, ufs_get_de_namlen(sb, de)); - - return (error_msg == NULL ? 1 : 0); + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; } -struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p) +static inline struct ufs_dir_entry * +ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) { - int err; - struct buffer_head *bh = ufs_bread (dir, 0, 0, &err); - struct ufs_dir_entry *res = NULL; - - if (bh) { - res = (struct ufs_dir_entry *) bh->b_data; - res = (struct ufs_dir_entry *)((char *)res + - fs16_to_cpu(dir->i_sb, res->d_reclen)); - } - *p = bh; - return res; + return (struct ufs_dir_entry *)((char *)p + + fs16_to_cpu(sb, p->d_reclen)); } -ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry) + +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) { - ino_t res = 0; - struct ufs_dir_entry * de; - struct buffer_head *bh; + struct page *page = ufs_get_page(dir, 0); + struct ufs_dir_entry *de = NULL; - de = ufs_find_entry (dentry, &bh); - if (de) { - res = fs32_to_cpu(dir->i_sb, de->d_ino); - brelse(bh); + if (!IS_ERR(page)) { + de = ufs_next_entry(dir->i_sb, + (struct ufs_dir_entry *)page_address(page)); + *p = page; } - return res; + return de; } -void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct buffer_head *bh, struct inode *inode) +/* + * ufs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry, + struct page **res_page) { - dir->i_version++; - de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); - mark_buffer_dirty(bh); - if (IS_DIRSYNC(dir)) - sync_dirty_buffer(bh); - brelse (bh); + struct super_block *sb = dir->i_sb; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = ufs_dir_pages(dir); + struct page *page = NULL; + struct ufs_inode_info *ui = UFS_I(dir); + struct ufs_dir_entry *de; + + UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + + if (npages == 0 || namelen > UFS_MAXNAMLEN) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ui->i_dir_start_lookup; + + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = ufs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct ufs_dir_entry *) kaddr; + kaddr += ufs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __FUNCTION__, + "zero-length directory entry"); + ufs_put_page(page); + goto out; + } + if (ufs_match(sb, namelen, name, de)) + goto found; + de = ufs_next_entry(sb, de); + } + ufs_put_page(page); + } + if (++n >= npages) + n = 0; + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ui->i_dir_start_lookup = n; + return de; } /* - * ufs_add_entry() - * - * adds a file entry to the specified directory, using the same - * semantics as ufs_find_entry(). It returns NULL if it failed. + * Parent is locked. */ int ufs_add_link(struct dentry *dentry, struct inode *inode) { - struct super_block * sb; - struct ufs_sb_private_info * uspi; - unsigned long offset; - unsigned fragoff; - unsigned short rec_len; - struct buffer_head * bh; - struct ufs_dir_entry * de, * de1; struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; + struct super_block *sb = dir->i_sb; + unsigned reclen = UFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct ufs_dir_entry *de; + unsigned long npages = ufs_dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; int err; - UFSD(("ENTER, name %s, namelen %u\n", name, namelen)) - - sb = dir->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - if (!namelen) - return -EINVAL; - bh = ufs_bread (dir, 0, 0, &err); - if (!bh) - return err; - rec_len = UFS_DIR_REC_LEN(namelen); - offset = 0; - de = (struct ufs_dir_entry *) bh->b_data; - while (1) { - if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) { - fragoff = offset & ~uspi->s_fmask; - if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE) - ufs_error (sb, "ufs_add_entry", "internal error" - " fragoff %u", fragoff); - if (!fragoff) { - brelse (bh); - bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err); - if (!bh) - return err; - } - if (dir->i_size <= offset) { - if (dir->i_size == 0) { - brelse(bh); - return -ENOENT; - } - de = (struct ufs_dir_entry *) (bh->b_data + fragoff); - de->d_ino = 0; + UFSD("ENTER, name %s, namelen %u\n", name, namelen); + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. + */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = ufs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + ufs_last_byte(dir, n); + de = (struct ufs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = UFS_SECTOR_SIZE; de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE); - ufs_set_de_namlen(sb, de, 0); - dir->i_size = offset + UFS_SECTOR_SIZE; - mark_inode_dirty(dir); - } else { - de = (struct ufs_dir_entry *) bh->b_data; + de->d_ino = 0; + goto got_it; } + if (de->d_reclen == 0) { + ufs_error(dir->i_sb, __FUNCTION__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (ufs_match(sb, namelen, name, de)) + goto out_unlock; + name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)); + rec_len = fs16_to_cpu(sb, de->d_reclen); + if (!de->d_ino && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct ufs_dir_entry *) ((char *) de + rec_len); } - if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) { - brelse (bh); - return -ENOENT; - } - if (ufs_match(sb, namelen, name, de)) { - brelse (bh); - return -EEXIST; - } - if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len) - break; - - if (fs16_to_cpu(sb, de->d_reclen) >= - UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len) - break; - offset += fs16_to_cpu(sb, de->d_reclen); - de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen)); + unlock_page(page); + ufs_put_page(page); } - + BUG(); + return -EINVAL; + +got_it: + from = (char*)de - (char*)page_address(page); + to = from + rec_len; + err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + if (err) + goto out_unlock; if (de->d_ino) { - de1 = (struct ufs_dir_entry *) ((char *) de + - UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); - de1->d_reclen = - cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) - - UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); - de->d_reclen = - cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de))); + struct ufs_dir_entry *de1 = + (struct ufs_dir_entry *) ((char *) de + name_len); + de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len); + de->d_reclen = cpu_to_fs16(sb, name_len); + de = de1; } - de->d_ino = 0; + ufs_set_de_namlen(sb, de, namelen); - memcpy (de->d_name, name, namelen + 1); + memcpy(de->d_name, name, namelen + 1); de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - mark_buffer_dirty(bh); - if (IS_DIRSYNC(dir)) - sync_dirty_buffer(bh); - brelse (bh); + + err = ufs_commit_chunk(page, from, to); dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - dir->i_version++; + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + ufs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} - UFSD(("EXIT\n")) +static inline unsigned +ufs_validate_entry(struct super_block *sb, char *base, + unsigned offset, unsigned mask) +{ + struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); + struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); + while ((char*)p < (char*)de) { + if (p->d_reclen == 0) + break; + p = ufs_next_entry(sb, p); + } + return (char *)p - base; +} + + +/* + * This is blatantly stolen from ext2fs + */ +static int +ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = ufs_dir_pages(inode); + unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1); + int need_revalidate = filp->f_version != inode->i_version; + unsigned flags = UFS_SB(sb)->s_flags; + + UFSD("BEGIN\n"); + + if (pos > inode->i_size - UFS_DIR_REC_LEN(1)) + return 0; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct ufs_dir_entry *de; + + struct page *page = ufs_get_page(inode, n); + + if (IS_ERR(page)) { + ufs_error(sb, __FUNCTION__, + "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + return -EIO; + } + kaddr = page_address(page); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; + need_revalidate = 0; + } + de = (struct ufs_dir_entry *)(kaddr+offset); + limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); + for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { + if (de->d_reclen == 0) { + ufs_error(sb, __FUNCTION__, + "zero-length directory entry"); + ufs_put_page(page); + return -EIO; + } + if (de->d_ino) { + int over; + unsigned char d_type = DT_UNKNOWN; + + offset = (char *)de - kaddr; + + UFSD("filldir(%s,%u)\n", de->d_name, + fs32_to_cpu(sb, de->d_ino)); + UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); + + if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) + d_type = de->d_u.d_44.d_type; + + over = filldir(dirent, de->d_name, + ufs_get_de_namlen(sb, de), + (n<<PAGE_CACHE_SHIFT) | offset, + fs32_to_cpu(sb, de->d_ino), d_type); + if (over) { + ufs_put_page(page); + return 0; + } + } + filp->f_pos += fs16_to_cpu(sb, de->d_reclen); + } + ufs_put_page(page); + } return 0; } + /* * ufs_delete_entry deletes a directory entry by merging it with the * previous entry. */ -int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir, - struct buffer_head * bh ) - +int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, + struct page * page) { - struct super_block * sb; - struct ufs_dir_entry * de, * pde; - unsigned i; - - UFSD(("ENTER\n")) + struct super_block *sb = inode->i_sb; + struct address_space *mapping = page->mapping; + char *kaddr = page_address(page); + unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1); + unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + struct ufs_dir_entry *pde = NULL; + struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + int err; - sb = inode->i_sb; - i = 0; - pde = NULL; - de = (struct ufs_dir_entry *) bh->b_data; - - UFSD(("ino %u, reclen %u, namlen %u, name %s\n", - fs32_to_cpu(sb, de->d_ino), - fs16_to_cpu(sb, de->d_reclen), - ufs_get_de_namlen(sb, de), de->d_name)) - - while (i < bh->b_size) { - if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) { - brelse(bh); - return -EIO; - } - if (de == dir) { - if (pde) - fs16_add(sb, &pde->d_reclen, - fs16_to_cpu(sb, dir->d_reclen)); - dir->d_ino = 0; - inode->i_version++; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - mark_buffer_dirty(bh); - if (IS_DIRSYNC(inode)) - sync_dirty_buffer(bh); - brelse(bh); - UFSD(("EXIT\n")) - return 0; + UFSD("ENTER\n"); + + UFSD("ino %u, reclen %u, namlen %u, name %s\n", + fs32_to_cpu(sb, de->d_ino), + fs16_to_cpu(sb, de->d_reclen), + ufs_get_de_namlen(sb, de), de->d_name); + + while ((char*)de < (char*)dir) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __FUNCTION__, + "zero-length directory entry"); + err = -EIO; + goto out; } - i += fs16_to_cpu(sb, de->d_reclen); - if (i == UFS_SECTOR_SIZE) pde = NULL; - else pde = de; - de = (struct ufs_dir_entry *) - ((char *) de + fs16_to_cpu(sb, de->d_reclen)); - if (i == UFS_SECTOR_SIZE && de->d_reclen == 0) - break; + pde = de; + de = ufs_next_entry(sb, de); } - UFSD(("EXIT\n")) - brelse(bh); - return -ENOENT; + if (pde) + from = (char*)pde - (char*)page_address(page); + lock_page(page); + err = mapping->a_ops->prepare_write(NULL, page, from, to); + BUG_ON(err); + if (pde) + pde->d_reclen = cpu_to_fs16(sb, to-from); + dir->d_ino = 0; + err = ufs_commit_chunk(page, from, to); + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + ufs_put_page(page); + UFSD("EXIT\n"); + return err; } int ufs_make_empty(struct inode * inode, struct inode *dir) { struct super_block * sb = dir->i_sb; - struct buffer_head * dir_block; + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); struct ufs_dir_entry * de; + char *base; int err; - dir_block = ufs_bread (inode, 0, 1, &err); - if (!dir_block) - return err; + if (!page) + return -ENOMEM; + kmap(page); + err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE); + if (err) { + unlock_page(page); + goto fail; + } + + + base = (char*)page_address(page); + memset(base, 0, PAGE_CACHE_SIZE); + + de = (struct ufs_dir_entry *) base; - inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE; - de = (struct ufs_dir_entry *) dir_block->b_data; de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); ufs_set_de_namlen(sb, de, 1); @@ -552,72 +587,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - mark_buffer_dirty(dir_block); - brelse (dir_block); - mark_inode_dirty(inode); - return 0; + + err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE); +fail: + kunmap(page); + page_cache_release(page); + return err; } /* * routine to check that the specified directory is empty (for rmdir) */ -int ufs_empty_dir (struct inode * inode) +int ufs_empty_dir(struct inode * inode) { - struct super_block * sb; - unsigned long offset; - struct buffer_head * bh; - struct ufs_dir_entry * de, * de1; - int err; - - sb = inode->i_sb; - - if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) || - !(bh = ufs_bread (inode, 0, 0, &err))) { - ufs_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no data block", - inode->i_ino); - return 1; - } - de = (struct ufs_dir_entry *) bh->b_data; - de1 = (struct ufs_dir_entry *) - ((char *)de + fs16_to_cpu(sb, de->d_reclen)); - if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 || - strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) { - ufs_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - return 1; - } - offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen); - de = (struct ufs_dir_entry *) - ((char *)de1 + fs16_to_cpu(sb, de1->d_reclen)); - while (offset < inode->i_size ) { - if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) { - brelse (bh); - bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err); - if (!bh) { - ufs_error (sb, "empty_dir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, offset); - offset += sb->s_blocksize; - continue; + struct super_block *sb = inode->i_sb; + struct page *page = NULL; + unsigned long i, npages = ufs_dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct ufs_dir_entry *de; + page = ufs_get_page(inode, i); + + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct ufs_dir_entry *)kaddr; + kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->d_reclen == 0) { + ufs_error(inode->i_sb, __FUNCTION__, + "zero-length directory entry: " + "kaddr=%p, de=%p\n", kaddr, de); + goto not_empty; } - de = (struct ufs_dir_entry *) bh->b_data; - } - if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) { - brelse (bh); - return 1; - } - if (de->d_ino) { - brelse (bh); - return 0; + if (de->d_ino) { + u16 namelen=ufs_get_de_namlen(sb, de); + /* check for . and .. */ + if (de->d_name[0] != '.') + goto not_empty; + if (namelen > 2) + goto not_empty; + if (namelen < 2) { + if (inode->i_ino != + fs32_to_cpu(sb, de->d_ino)) + goto not_empty; + } else if (de->d_name[1] != '.') + goto not_empty; + } + de = ufs_next_entry(sb, de); } - offset += fs16_to_cpu(sb, de->d_reclen); - de = (struct ufs_dir_entry *) - ((char *)de + fs16_to_cpu(sb, de->d_reclen)); + ufs_put_page(page); } - brelse (bh); return 1; + +not_empty: + ufs_put_page(page); + return 0; } const struct file_operations ufs_dir_operations = { diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 312fd3f86313..0e5001512a9d 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -25,6 +25,26 @@ #include <linux/fs.h> #include <linux/ufs_fs.h> +#include <linux/buffer_head.h> /* for sync_mapping_buffers() */ + +static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = ufs_sync_inode(inode); + if (ret == 0) + ret = err; + return ret; +} + /* * We have mostly NULL's here: the current defaults are ok for @@ -37,6 +57,7 @@ const struct file_operations ufs_file_operations = { .write = generic_file_write, .mmap = generic_file_mmap, .open = generic_file_open, + .fsync = ufs_sync_file, .sendfile = generic_file_sendfile, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index c7a47ed4f430..9501dcd3b213 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -34,14 +34,6 @@ #include "swab.h" #include "util.h" -#undef UFS_IALLOC_DEBUG - -#ifdef UFS_IALLOC_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - /* * NOTE! When we get the inode, we're the only people * that have access to it, and as such there are no @@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode) int is_directory; unsigned ino, cg, bit; - UFSD(("ENTER, ino %lu\n", inode->i_ino)) + UFSD("ENTER, ino %lu\n", inode->i_ino); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -91,7 +83,7 @@ void ufs_free_inode (struct inode * inode) unlock_super (sb); return; } - ucg = ubh_get_ucg(UCPI_UBH); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number"); @@ -104,33 +96,33 @@ void ufs_free_inode (struct inode * inode) clear_inode (inode); - if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino); else { - ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit); + ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); if (ino < ucpi->c_irotor) ucpi->c_irotor = ino; fs32_add(sb, &ucg->cg_cs.cs_nifree, 1); - fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1); + uspi->cs_total.cs_nifree++; fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1); if (is_directory) { fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1); - fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1); + uspi->cs_total.cs_ndir--; fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1); } } - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } sb->s_dirt = 1; unlock_super (sb); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); } /* @@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode) unsigned cg, bit, i, j, start; struct ufs_inode_info *ufsi; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -213,43 +205,43 @@ cg_found: ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) goto failed; - ucg = ubh_get_ucg(UCPI_UBH); + ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (!ufs_cg_chkmagic(sb, ucg)) ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number"); start = ucpi->c_irotor; - bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start); + bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start); if (!(bit < uspi->s_ipg)) { - bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start); + bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start); if (!(bit < start)) { ufs_error (sb, "ufs_new_inode", "cylinder group %u corrupted - error in inode bitmap\n", cg); goto failed; } } - UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg)) - if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit)) - ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit); + UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg); + if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) + ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit); else { ufs_panic (sb, "ufs_new_inode", "internal error"); goto failed; } fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1); - fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1); + uspi->cs_total.cs_nifree--; fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1); if (S_ISDIR(mode)) { fs32_add(sb, &ucg->cg_cs.cs_ndir, 1); - fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1); + uspi->cs_total.cs_ndir++; fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1); } - ubh_mark_buffer_dirty (USPI_UBH); - ubh_mark_buffer_dirty (UCPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); + ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi); - ubh_wait_on_buffer (UCPI_UBH); + ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); + ubh_wait_on_buffer (UCPI_UBH(ucpi)); } sb->s_dirt = 1; @@ -272,6 +264,7 @@ cg_found: ufsi->i_shadow = 0; ufsi->i_osync = 0; ufsi->i_oeftflag = 0; + ufsi->i_dir_start_lookup = 0; memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); insert_inode_hash(inode); @@ -287,14 +280,14 @@ cg_found: return ERR_PTR(-EDQUOT); } - UFSD(("allocating inode %lu\n", inode->i_ino)) - UFSD(("EXIT\n")) + UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("EXIT\n"); return inode; failed: unlock_super (sb); make_bad_inode(inode); iput (inode); - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return ERR_PTR(-ENOSPC); } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 3c3f62ce2ad9..259bd196099d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,14 +41,7 @@ #include "swab.h" #include "util.h" -#undef UFS_INODE_DEBUG -#undef UFS_INODE_DEBUG_MORE - -#ifdef UFS_INODE_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif +static u64 ufs_frag_map(struct inode *inode, sector_t frag); static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) { @@ -61,7 +54,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off int n = 0; - UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks)); + UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); if (i_block < 0) { ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0"); } else if (i_block < direct_blocks) { @@ -89,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off * the begining of the filesystem. */ -u64 ufs_frag_map(struct inode *inode, sector_t frag) +static u64 ufs_frag_map(struct inode *inode, sector_t frag) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -104,8 +97,8 @@ u64 ufs_frag_map(struct inode *inode, sector_t frag) unsigned flags = UFS_SB(sb)->s_flags; u64 temp = 0L; - UFSD((": frag = %llu depth = %d\n", (unsigned long long)frag, depth)); - UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask)); + UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); + UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask); if (depth == 0) return 0; @@ -161,26 +154,64 @@ out: return ret; } -static struct buffer_head * ufs_inode_getfrag (struct inode *inode, - unsigned int fragment, unsigned int new_fragment, - unsigned int required, int *err, int metadata, long *phys, int *new) +static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh) +{ + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + if (IS_SYNC(inode)) + sync_dirty_buffer(bh); +} + +static struct buffer_head * +ufs_clear_frags(struct inode *inode, sector_t beg, + unsigned int n) +{ + struct buffer_head *res, *bh; + sector_t end = beg + n; + + res = sb_getblk(inode->i_sb, beg); + ufs_clear_frag(inode, res); + for (++beg; beg < end; ++beg) { + bh = sb_getblk(inode->i_sb, beg); + ufs_clear_frag(inode, bh); + brelse(bh); + } + return res; +} + +/** + * ufs_inode_getfrag() - allocate new fragment(s) + * @inode - pointer to inode + * @fragment - number of `fragment' which hold pointer + * to new allocated fragment(s) + * @new_fragment - number of new allocated fragment(s) + * @required - how many fragment(s) we require + * @err - we set it if something wrong + * @phys - pointer to where we save physical number of new allocated fragments, + * NULL if we allocate not data(indirect blocks for example). + * @new - we set it if we allocate new block + * @locked_page - for ufs_new_fragments() + */ +static struct buffer_head * +ufs_inode_getfrag(struct inode *inode, unsigned int fragment, + sector_t new_fragment, unsigned int required, int *err, + long *phys, int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * result; unsigned block, blockoff, lastfrag, lastblock, lastblockoff; unsigned tmp, goal; __fs32 * p, * p2; - unsigned flags = 0; - UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n", - inode->i_ino, fragment, new_fragment, required)) + UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, " + "metadata %d\n", inode->i_ino, fragment, + (unsigned long long)new_fragment, required, !phys); - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - flags = UFS_SB(sb)->s_flags; /* TODO : to be done for write support if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; @@ -195,16 +226,16 @@ repeat: tmp = fs32_to_cpu(sb, *p); lastfrag = ufsi->i_lastfrag; if (tmp && fragment < lastfrag) { - if (metadata) { + if (!phys) { result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); if (tmp == fs32_to_cpu(sb, *p)) { - UFSD(("EXIT, result %u\n", tmp + blockoff)) + UFSD("EXIT, result %u\n", tmp + blockoff); return result; } brelse (result); goto repeat; } else { - *phys = tmp; + *phys = tmp + blockoff; return NULL; } } @@ -221,7 +252,8 @@ repeat: if (lastblockoff) { p2 = ufsi->i_u1.i_data + lastblock; tmp = ufs_new_fragments (inode, p2, lastfrag, - fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err); + fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, + err, locked_page); if (!tmp) { if (lastfrag != ufsi->i_lastfrag) goto repeat; @@ -233,14 +265,16 @@ repeat: } goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, required + blockoff, err); + goal, required + blockoff, + err, locked_page); } /* * We will extend last allocated block */ else if (lastblock == block) { - tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff), - fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err); + tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), + fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), + err, locked_page); } /* * We will allocate new block before last allocated block @@ -248,8 +282,8 @@ repeat: else /* (lastblock > block) */ { if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, uspi->s_fpb, err); + tmp = ufs_new_fragments(inode, p, fragment - blockoff, + goal, uspi->s_fpb, err, locked_page); } if (!tmp) { if ((!blockoff && *p) || @@ -259,14 +293,10 @@ repeat: return NULL; } - /* The nullification of framgents done in ufs/balloc.c is - * something I don't have the stomache to move into here right - * now. -DaveM - */ - if (metadata) { - result = sb_getblk(inode->i_sb, tmp + blockoff); + if (!phys) { + result = ufs_clear_frags(inode, tmp + blockoff, required); } else { - *phys = tmp; + *phys = tmp + blockoff; result = NULL; *err = 0; *new = 1; @@ -276,7 +306,7 @@ repeat: if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD(("EXIT, result %u\n", tmp + blockoff)) + UFSD("EXIT, result %u\n", tmp + blockoff); return result; /* This part : To be implemented .... @@ -295,22 +325,35 @@ repeat2: */ } -static struct buffer_head * ufs_block_getfrag (struct inode *inode, - struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, - unsigned int blocksize, int * err, int metadata, long *phys, int *new) +/** + * ufs_inode_getblock() - allocate new block + * @inode - pointer to inode + * @bh - pointer to block which hold "pointer" to new allocated block + * @fragment - number of `fragment' which hold pointer + * to new allocated block + * @new_fragment - number of new allocated fragment + * (block will hold this fragment and also uspi->s_fpb-1) + * @err - see ufs_inode_getfrag() + * @phys - see ufs_inode_getfrag() + * @new - see ufs_inode_getfrag() + * @locked_page - see ufs_inode_getfrag() + */ +static struct buffer_head * +ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, + unsigned int fragment, sector_t new_fragment, int *err, + long *phys, int *new, struct page *locked_page) { - struct super_block * sb; - struct ufs_sb_private_info * uspi; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * result; unsigned tmp, goal, block, blockoff; __fs32 * p; - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; block = ufs_fragstoblks (fragment); blockoff = ufs_fragnum (fragment); - UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment)) + UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n", + inode->i_ino, fragment, (unsigned long long)new_fragment, !phys); result = NULL; if (!bh) @@ -326,14 +369,14 @@ static struct buffer_head * ufs_block_getfrag (struct inode *inode, repeat: tmp = fs32_to_cpu(sb, *p); if (tmp) { - if (metadata) { + if (!phys) { result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); if (tmp == fs32_to_cpu(sb, *p)) goto out; brelse (result); goto repeat; } else { - *phys = tmp; + *phys = tmp + blockoff; goto out; } } @@ -342,21 +385,19 @@ repeat: goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err); + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, + uspi->s_fpb, err, locked_page); if (!tmp) { if (fs32_to_cpu(sb, *p)) goto repeat; goto out; } - /* The nullification of framgents done in ufs/balloc.c is - * something I don't have the stomache to move into here right - * now. -DaveM - */ - if (metadata) { - result = sb_getblk(sb, tmp + blockoff); + + if (!phys) { + result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb); } else { - *phys = tmp; + *phys = tmp + blockoff; *new = 1; } @@ -365,18 +406,19 @@ repeat: sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD(("result %u\n", tmp + blockoff)); + UFSD("result %u\n", tmp + blockoff); out: brelse (bh); - UFSD(("EXIT\n")); + UFSD("EXIT\n"); return result; } -/* - * This function gets the block which contains the fragment. +/** + * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and + * readpage, writepage and so on */ -int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; @@ -387,7 +429,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea if (!create) { phys64 = ufs_frag_map(inode, fragment); - UFSD(("phys64 = %llu \n",phys64)); + UFSD("phys64 = %llu \n",phys64); if (phys64) map_bh(bh_result, sb, phys64); return 0; @@ -402,7 +444,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea lock_kernel(); - UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment)) + UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); if (fragment < 0) goto abort_negative; if (fragment > @@ -418,15 +460,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea * it much more readable: */ #define GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new) + ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page) #define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL) + ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page) #define GET_INDIRECT_DATABLOCK(x) \ - ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ - &err, 0, &phys, &new); + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, &phys, &new, bh_result->b_page); #define GET_INDIRECT_PTR(x) \ - ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \ - &err, 1, NULL, NULL); + ufs_inode_getblock(inode, bh, x, fragment, \ + &err, NULL, NULL, bh_result->b_page); if (ptr < UFS_NDIR_FRAGMENT) { bh = GET_INODE_DATABLOCK(ptr); @@ -474,8 +516,9 @@ abort_too_big: goto abort; } -struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment, - int create, int *err) +static struct buffer_head *ufs_getfrag(struct inode *inode, + unsigned int fragment, + int create, int *err) { struct buffer_head dummy; int error; @@ -502,7 +545,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment, { struct buffer_head * bh; - UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment)) + UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment); bh = ufs_getfrag (inode, fragment, create, err); if (!bh || buffer_uptodate(bh)) return bh; @@ -540,39 +583,34 @@ struct address_space_operations ufs_aops = { .bmap = ufs_bmap }; -void ufs_read_inode (struct inode * inode) +static void ufs_set_inode_ops(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ufs_dir_inode_operations; + inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ufs_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ufs_aops; + } + } else + init_special_inode(inode, inode->i_mode, + ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); +} + +static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_inode * ufs_inode; - struct ufs2_inode *ufs2_inode; - struct buffer_head * bh; + struct super_block *sb = inode->i_sb; mode_t mode; unsigned i; - unsigned flags; - - UFSD(("ENTER, ino %lu\n", inode->i_ino)) - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - flags = UFS_SB(sb)->s_flags; - - if (inode->i_ino < UFS_ROOTINO || - inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); - goto bad_inode; - } - - bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); - if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); - goto bad_inode; - } - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - goto ufs2_inode; - - ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino)); /* * Copy data to the in-core inode. @@ -596,56 +634,29 @@ void ufs_read_inode (struct inode * inode) inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */ - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i]; - } - else { + } else { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; } - ufsi->i_osync = 0; - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ufs_dir_inode_operations; - inode->i_fop = &ufs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - if (!inode->i_blocks) - inode->i_op = &ufs_fast_symlink_inode_operations; - else { - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &ufs_aops; - } - } else - init_special_inode(inode, inode->i_mode, - ufs_get_inode_dev(sb, ufsi)); - - brelse (bh); - - UFSD(("EXIT\n")) - return; - -bad_inode: - make_bad_inode(inode); - return; - -ufs2_inode : - UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino)) +} - ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino)); +static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + unsigned i; + UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); /* * Copy data to the in-core inode. */ @@ -668,50 +679,75 @@ ufs2_inode : inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/ - - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); /* ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); */ - ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift; if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) ufsi->i_u1.u2_i_data[i] = ufs2_inode->ui_u2.ui_addr.ui_db[i]; - } - else { + } else { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; } +} + +void ufs_read_inode(struct inode * inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + inode->i_ino); + goto bad_inode; + } + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/ + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + ufsi->i_dir_start_lookup = 0; ufsi->i_osync = 0; - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ufs_dir_inode_operations; - inode->i_fop = &ufs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - if (!inode->i_blocks) - inode->i_op = &ufs_fast_symlink_inode_operations; - else { - inode->i_op = &page_symlink_inode_operations; - inode->i_mapping->a_ops = &ufs_aops; - } - } else /* TODO : here ...*/ - init_special_inode(inode, inode->i_mode, - ufs_get_inode_dev(sb, ufsi)); + ufs_set_inode_ops(inode); brelse(bh); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return; + +bad_inode: + make_bad_inode(inode); } static int ufs_update_inode(struct inode * inode, int do_sync) @@ -724,7 +760,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync) unsigned i; unsigned flags; - UFSD(("ENTER, ino %lu\n", inode->i_ino)) + UFSD("ENTER, ino %lu\n", inode->i_ino); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -785,7 +821,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync) sync_dirty_buffer(bh); brelse (bh); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 8d5f98a01c74..abd5f23a426d 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -1,6 +1,9 @@ /* * linux/fs/ufs/namei.c * + * Migration to usage of "page cache" on May 2006 by + * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base. + * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics @@ -28,21 +31,9 @@ #include <linux/fs.h> #include <linux/ufs_fs.h> #include <linux/smp_lock.h> -#include <linux/buffer_head.h> #include "swab.h" /* will go away - see comment in mknod() */ #include "util.h" -/* -#undef UFS_NAMEI_DEBUG -*/ -#define UFS_NAMEI_DEBUG - -#ifdef UFS_NAMEI_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = ufs_add_link(dentry, inode); @@ -88,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - struct inode * inode = ufs_new_inode(dir, mode); - int err = PTR_ERR(inode); + struct inode *inode; + int err; + + UFSD("BEGIN\n"); + inode = ufs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { inode->i_op = &ufs_file_inode_operations; inode->i_fop = &ufs_file_operations; @@ -99,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, err = ufs_add_nondir(dentry, inode); unlock_kernel(); } + UFSD("END: err=%d\n", err); return err; } @@ -205,6 +202,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; + inode->i_mapping->a_ops = &ufs_aops; inode_inc_link_count(inode); @@ -231,19 +229,18 @@ out_dir: goto out; } -static int ufs_unlink(struct inode * dir, struct dentry *dentry) +static int ufs_unlink(struct inode *dir, struct dentry *dentry) { struct inode * inode = dentry->d_inode; - struct buffer_head * bh; - struct ufs_dir_entry * de; + struct ufs_dir_entry *de; + struct page *page; int err = -ENOENT; - lock_kernel(); - de = ufs_find_entry (dentry, &bh); + de = ufs_find_entry(dir, dentry, &page); if (!de) goto out; - err = ufs_delete_entry (dir, de, bh); + err = ufs_delete_entry(dir, de, page); if (err) goto out; @@ -251,7 +248,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry) inode_dec_link_count(inode); err = 0; out: - unlock_kernel(); return err; } @@ -273,42 +269,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry ) +static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; - struct buffer_head *dir_bh = NULL; - struct ufs_dir_entry *dir_de = NULL; - struct buffer_head *old_bh; + struct page *dir_page = NULL; + struct ufs_dir_entry * dir_de = NULL; + struct page *old_page; struct ufs_dir_entry *old_de; int err = -ENOENT; - lock_kernel(); - old_de = ufs_find_entry (old_dentry, &old_bh); + old_de = ufs_find_entry(old_dir, old_dentry, &old_page); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ufs_dotdot(old_inode, &dir_bh); + dir_de = ufs_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { - struct buffer_head *new_bh; + struct page *new_page; struct ufs_dir_entry *new_de; err = -ENOTEMPTY; - if (dir_de && !ufs_empty_dir (new_inode)) + if (dir_de && !ufs_empty_dir(new_inode)) goto out_dir; + err = -ENOENT; - new_de = ufs_find_entry (new_dentry, &new_bh); + new_de = ufs_find_entry(new_dir, new_dentry, &new_page); if (!new_de) goto out_dir; inode_inc_link_count(old_inode); - ufs_set_link(new_dir, new_de, new_bh, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) new_inode->i_nlink--; @@ -329,24 +325,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry, inode_inc_link_count(new_dir); } - ufs_delete_entry (old_dir, old_de, old_bh); + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + * inode_dec_link_count() will mark the inode dirty. + */ + old_inode->i_ctime = CURRENT_TIME_SEC; + ufs_delete_entry(old_dir, old_de, old_page); inode_dec_link_count(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_bh, new_dir); + ufs_set_link(old_inode, dir_de, dir_page, new_dir); inode_dec_link_count(old_dir); } - unlock_kernel(); return 0; + out_dir: - if (dir_de) - brelse(dir_bh); + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } out_old: - brelse (old_bh); + kunmap(old_page); + page_cache_release(old_page); out: - unlock_kernel(); return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index fe5ab2aa2899..74ef5e9bedff 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -90,95 +90,84 @@ #include "swab.h" #include "util.h" -#undef UFS_SUPER_DEBUG -#undef UFS_SUPER_DEBUG_MORE - - -#undef UFS_SUPER_DEBUG_MORE -#ifdef UFS_SUPER_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - -#ifdef UFS_SUPER_DEBUG_MORE +#ifdef CONFIG_UFS_DEBUG /* * Print contents of ufs_super_block, useful for debugging */ -void ufs_print_super_stuff(struct super_block *sb, - struct ufs_super_block_first * usb1, - struct ufs_super_block_second * usb2, - struct ufs_super_block_third * usb3) +static void ufs_print_super_stuff(struct super_block *sb, unsigned flags, + struct ufs_super_block_first *usb1, + struct ufs_super_block_second *usb2, + struct ufs_super_block_third *usb3) { printk("ufs_print_super_stuff\n"); - printk("size of usb: %u\n", sizeof(struct ufs_super_block)); - printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic)); - printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); - printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); - printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); - printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); - printk(" cgoffset: %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset)); - printk(" ~cgmask: 0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask)); - printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); - printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); - printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); - printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); - printk(" fragshift: %u\n", fs32_to_cpu(sb, usb1->fs_fragshift)); - printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); - printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); - printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); - printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); - printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); - printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); - printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); - printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); - printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); - printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); - printk(" fstodb: %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb)); - printk(" contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize)); - printk(" postblformat: %u\n", fs32_to_cpu(sb, usb3->fs_postblformat)); - printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); - printk(" ndir %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); - printk(" nifree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); - printk(" nbfree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); - printk(" nffree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); - printk("\n"); -} - -/* - * Print contents of ufs2 ufs_super_block, useful for debugging - */ -void ufs2_print_super_stuff( - struct super_block *sb, - struct ufs_super_block *usb) -{ - printk("ufs_print_super_stuff\n"); - printk("size of usb: %u\n", sizeof(struct ufs_super_block)); - printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb->fs_magic)); - printk(" fs_size: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size)); - printk(" fs_dsize: %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize)); - printk(" bsize: %u\n", fs32_to_cpu(usb, usb->fs_bsize)); - printk(" fsize: %u\n", fs32_to_cpu(usb, usb->fs_fsize)); - printk(" fs_volname: %s\n", usb->fs_u11.fs_u2.fs_volname); - printk(" fs_fsmnt: %s\n", usb->fs_u11.fs_u2.fs_fsmnt); - printk(" fs_sblockloc: %u\n",fs64_to_cpu(sb, - usb->fs_u11.fs_u2.fs_sblockloc)); - printk(" cs_ndir(No of dirs): %u\n",fs64_to_cpu(sb, - usb->fs_u11.fs_u2.fs_cstotal.cs_ndir)); - printk(" cs_nbfree(No of free blocks): %u\n",fs64_to_cpu(sb, - usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)); + printk(" magic: 0x%x\n", fs32_to_cpu(sb, usb3->fs_magic)); + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + printk(" fs_size: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + printk(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + printk(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + printk(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + printk(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + } else { + printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + printk(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + printk(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + printk(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + printk(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + printk(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + printk(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + printk(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + printk(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + } printk("\n"); } /* * Print contents of ufs_cylinder_group, useful for debugging */ -void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) +static void ufs_print_cylinder_stuff(struct super_block *sb, + struct ufs_cylinder_group *cg) { printk("\nufs_print_cylinder_stuff\n"); - printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group)); + printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); @@ -202,12 +191,18 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); - printk(" clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); - printk(" clusteroff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); - printk(" nclusterblks %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + printk(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + printk(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + printk(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); printk("\n"); } -#endif /* UFS_SUPER_DEBUG_MORE */ +#else +# define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/ +# define ufs_print_cylinder_stuff(sb, cg) /**/ +#endif /* CONFIG_UFS_DEBUG */ static struct super_operations ufs_super_ops; @@ -225,7 +220,7 @@ void ufs_error (struct super_block * sb, const char * function, if (!(sb->s_flags & MS_RDONLY)) { usb1->fs_clean = UFS_FSBAD; - ubh_mark_buffer_dirty(USPI_UBH); + ubh_mark_buffer_dirty(USPI_UBH(uspi)); sb->s_dirt = 1; sb->s_flags |= MS_RDONLY; } @@ -257,7 +252,7 @@ void ufs_panic (struct super_block * sb, const char * function, if (!(sb->s_flags & MS_RDONLY)) { usb1->fs_clean = UFS_FSBAD; - ubh_mark_buffer_dirty(USPI_UBH); + ubh_mark_buffer_dirty(USPI_UBH(uspi)); sb->s_dirt = 1; } va_start (args, fmt); @@ -309,7 +304,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options) { char * p; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); if (!options) return 1; @@ -386,27 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options) } /* + * Diffrent types of UFS hold fs_cstotal in different + * places, and use diffrent data structure for it. + * To make things simplier we just copy fs_cstotal to ufs_sb_private_info + */ +static void ufs_setup_cstotal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + + UFSD("ENTER, mtype=%u\n", mtype); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); + uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); + uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); + uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); + } else { + uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); + uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); + uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); + uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); + } + UFSD("EXIT\n"); +} + +/* * Read on-disk structures associated with cylinder groups */ -static int ufs_read_cylinder_structures (struct super_block *sb) +static int ufs_read_cylinder_structures(struct super_block *sb) { - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi; - struct ufs_super_block *usb; + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; + unsigned flags = sbi->s_flags; struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; - unsigned flags = 0; - - UFSD(("ENTER\n")) - - uspi = sbi->s_uspi; + struct ufs_super_block_third *usb3; - usb = (struct ufs_super_block *) - ((struct ufs_buffer_head *)uspi)->bh[0]->b_data; + UFSD("ENTER\n"); - flags = UFS_SB(sb)->s_flags; - + usb3 = ubh_get_usb_third(uspi); /* * Read cs structures from (usually) first data block * on the device. @@ -424,7 +449,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) ubh = ubh_bread(sb, - fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size); + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size); else ubh = ubh_bread(sb, uspi->s_csaddr + i, size); @@ -451,14 +476,13 @@ static int ufs_read_cylinder_structures (struct super_block *sb) sbi->s_cgno[i] = UFS_CGNO_EMPTY; } for (i = 0; i < uspi->s_ncg; i++) { - UFSD(("read cg %u\n", i)) + UFSD("read cg %u\n", i); if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) goto failed; if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) goto failed; -#ifdef UFS_SUPER_DEBUG_MORE + ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); -#endif } for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) @@ -466,7 +490,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) sbi->s_cgno[i] = UFS_CGNO_EMPTY; } sbi->s_cg_loaded = 0; - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return 1; failed: @@ -479,26 +503,69 @@ failed: for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) kfree (sbi->s_ucpi[i]); } - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return 0; } /* - * Put on-disk structures associated with cylinder groups and - * write them back to disk + * Sync our internal copy of fs_cstotal with disk */ -static void ufs_put_cylinder_structures (struct super_block *sb) +static void ufs_put_cstotal(struct super_block *sb) { - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi; + unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; + + UFSD("ENTER\n"); + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); + + if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + mtype == UFS_MOUNT_UFSTYPE_UFS2) { + /*we have statistic in different place, then usual*/ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); + } else { + usb1->fs_cstotal.cs_ndir = + cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = + cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = + cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = + cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + } + ubh_mark_buffer_dirty(USPI_UBH(uspi)); + UFSD("EXIT\n"); +} + +/** + * ufs_put_super_internal() - put on-disk intrenal structures + * @sb: pointer to super_block structure + * Put on-disk structures associated with cylinder groups + * and write them back to disk, also update cs_total on disk + */ +static void ufs_put_super_internal(struct super_block *sb) +{ + struct ufs_sb_info *sbi = UFS_SB(sb); + struct ufs_sb_private_info *uspi = sbi->s_uspi; struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned blks, size, i; - - UFSD(("ENTER\n")) - - uspi = sbi->s_uspi; + + UFSD("ENTER\n"); + ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = (char*) sbi->s_csp; @@ -523,7 +590,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); } static int ufs_fill_super(struct super_block *sb, void *data, int silent) @@ -533,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) struct ufs_super_block_first * usb1; struct ufs_super_block_second * usb2; struct ufs_super_block_third * usb3; - struct ufs_super_block *usb; struct ufs_buffer_head * ubh; struct inode *inode; unsigned block_size, super_block_size; @@ -544,7 +610,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) ubh = NULL; flags = 0; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) @@ -552,7 +618,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; memset(sbi, 0, sizeof(struct ufs_sb_info)); - UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY))) + UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); #ifndef CONFIG_UFS_FS_WRITE if (!(sb->s_flags & MS_RDONLY)) { @@ -593,7 +659,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) the rules */ switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: - UFSD(("ufstype=44bsd\n")) + UFSD("ufstype=44bsd\n"); uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); uspi->s_fshift = 9; @@ -602,7 +668,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; case UFS_MOUNT_UFSTYPE_UFS2: - UFSD(("ufstype=ufs2\n")); + UFSD("ufstype=ufs2\n"); super_block_offset=SBLOCK_UFS2; uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); @@ -617,7 +683,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_SUN: - UFSD(("ufstype=sun\n")) + UFSD("ufstype=sun\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -628,7 +694,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_SUNx86: - UFSD(("ufstype=sunx86\n")) + UFSD("ufstype=sunx86\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -639,7 +705,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_OLD: - UFSD(("ufstype=old\n")) + UFSD("ufstype=old\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -654,7 +720,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP: - UFSD(("ufstype=nextstep\n")) + UFSD("ufstype=nextstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -669,7 +735,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: - UFSD(("ufstype=nextstep-cd\n")) + UFSD("ufstype=nextstep-cd\n"); uspi->s_fsize = block_size = 2048; uspi->s_fmask = ~(2048 - 1); uspi->s_fshift = 11; @@ -684,7 +750,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_OPENSTEP: - UFSD(("ufstype=openstep\n")) + UFSD("ufstype=openstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -699,7 +765,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_HP: - UFSD(("ufstype=hp\n")) + UFSD("ufstype=hp\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -737,8 +803,6 @@ again: usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); - usb = (struct ufs_super_block *) - ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ; /* * Check ufs magic number @@ -820,16 +884,12 @@ magic_found: ubh = NULL; block_size = uspi->s_fsize; super_block_size = uspi->s_sbsize; - UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size)) + UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); goto again; } -#ifdef UFS_SUPER_DEBUG_MORE - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - ufs2_print_super_stuff(sb,usb); - else - ufs_print_super_stuff(sb, usb1, usb2, usb3); -#endif + + ufs_print_super_stuff(sb, flags, usb1, usb2, usb3); /* * Check, if file system was correctly unmounted. @@ -842,13 +902,13 @@ magic_found: (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { switch(usb1->fs_clean) { case UFS_FSCLEAN: - UFSD(("fs is clean\n")) + UFSD("fs is clean\n"); break; case UFS_FSSTABLE: - UFSD(("fs is stable\n")) + UFSD("fs is stable\n"); break; case UFS_FSOSF1: - UFSD(("fs is DEC OSF/1\n")) + UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: printk("ufs_read_super: fs is active\n"); @@ -863,8 +923,7 @@ magic_found: sb->s_flags |= MS_RDONLY; break; } - } - else { + } else { printk("ufs_read_super: fs needs fsck\n"); sb->s_flags |= MS_RDONLY; } @@ -884,10 +943,9 @@ magic_found: uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { - uspi->s_u2_size = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size); - uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); - } - else { + uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); } @@ -901,8 +959,8 @@ magic_found: uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); - UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, - uspi->s_fshift)); + UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, + uspi->s_fshift); uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); /* s_sbsize already set */ @@ -922,8 +980,8 @@ magic_found: uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); - uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc); - uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize); + uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); + uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); @@ -935,12 +993,11 @@ magic_found: * Compute another frequently used values */ uspi->s_fpbmask = uspi->s_fpb - 1; - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) uspi->s_apbshift = uspi->s_bshift - 3; - } - else { + else uspi->s_apbshift = uspi->s_bshift - 2; - } + uspi->s_2apbshift = uspi->s_apbshift * 2; uspi->s_3apbshift = uspi->s_apbshift * 3; uspi->s_apb = 1 << uspi->s_apbshift; @@ -956,7 +1013,7 @@ magic_found: if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD) uspi->s_maxsymlinklen = - fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen); + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); sbi->s_flags = flags; @@ -967,7 +1024,7 @@ magic_found: if (!sb->s_root) goto dalloc_failed; - + ufs_setup_cstotal(sb); /* * Read cylinder group structures */ @@ -975,7 +1032,7 @@ magic_found: if (!ufs_read_cylinder_structures(sb)) goto failed; - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return 0; dalloc_failed: @@ -986,15 +1043,16 @@ failed: kfree (uspi); kfree(sbi); sb->s_fs_info = NULL; - UFSD(("EXIT (FAILED)\n")) + UFSD("EXIT (FAILED)\n"); return -EINVAL; failed_nomem: - UFSD(("EXIT (NOMEM)\n")) + UFSD("EXIT (NOMEM)\n"); return -ENOMEM; } -static void ufs_write_super (struct super_block *sb) { +static void ufs_write_super(struct super_block *sb) +{ struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; @@ -1002,7 +1060,7 @@ static void ufs_write_super (struct super_block *sb) { lock_kernel(); - UFSD(("ENTER\n")) + UFSD("ENTER\n"); flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); @@ -1014,26 +1072,27 @@ static void ufs_write_super (struct super_block *sb) { || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ubh_mark_buffer_dirty (USPI_UBH); + ufs_put_cstotal(sb); } sb->s_dirt = 0; - UFSD(("EXIT\n")) + UFSD("EXIT\n"); unlock_kernel(); } -static void ufs_put_super (struct super_block *sb) +static void ufs_put_super(struct super_block *sb) { struct ufs_sb_info * sbi = UFS_SB(sb); - UFSD(("ENTER\n")) + UFSD("ENTER\n"); if (!(sb->s_flags & MS_RDONLY)) - ufs_put_cylinder_structures (sb); + ufs_put_super_internal(sb); ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); kfree (sbi); sb->s_fs_info = NULL; + UFSD("EXIT\n"); return; } @@ -1062,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return -EINVAL; if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; - } - else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { + } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); return -EINVAL; } @@ -1077,20 +1135,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) * fs was mouted as rw, remounting ro */ if (*mount_flags & MS_RDONLY) { - ufs_put_cylinder_structures(sb); + ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ubh_mark_buffer_dirty (USPI_UBH); + ubh_mark_buffer_dirty (USPI_UBH(uspi)); sb->s_dirt = 0; sb->s_flags |= MS_RDONLY; - } + } else { /* * fs was mounted as ro, remounting rw */ - else { #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); @@ -1102,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) printk("this ufstype is read-only supported\n"); return -EINVAL; } - if (!ufs_read_cylinder_structures (sb)) { + if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); return -EPERM; } @@ -1113,37 +1170,31 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return 0; } -static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf) +static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; - struct ufs_super_block * usb; - unsigned flags = 0; + struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; + unsigned flags = UFS_SB(sb)->s_flags; + struct ufs_super_block_first *usb1; + struct ufs_super_block_second *usb2; + struct ufs_super_block_third *usb3; lock_kernel(); - uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first (uspi); - usb = (struct ufs_super_block *) - ((struct ufs_buffer_head *)uspi)->bh[0]->b_data ; + usb1 = ubh_get_usb_first(uspi); + usb2 = ubh_get_usb_second(uspi); + usb3 = ubh_get_usb_third(uspi); - flags = UFS_SB(sb)->s_flags; if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { buf->f_type = UFS2_MAGIC; - buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize); - buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) + - fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree); - buf->f_ffree = fs64_to_cpu(sb, - usb->fs_u11.fs_u2.fs_cstotal.cs_nifree); - } - else { + buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + } else { buf->f_type = UFS_MAGIC; buf->f_blocks = uspi->s_dsize; - buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) + - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); - buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); } + buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree; + buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 02e86291ef8a..3c3b301f8701 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -49,14 +49,6 @@ #include "swab.h" #include "util.h" -#undef UFS_TRUNCATE_DEBUG - -#ifdef UFS_TRUNCATE_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - /* * Secure deletion currently doesn't work. It interacts very badly * with buffers shared with memory mappings, and for that reason @@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode) unsigned i, tmp; int retry; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode) block2 = ufs_fragstoblks (frag3); } - UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4)) + UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4); if (frag1 >= frag2) goto next1; @@ -120,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode) frag1 = ufs_fragnum (frag1); frag2 = ufs_fragnum (frag2); - inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; - mark_inode_dirty(inode); ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); + mark_inode_dirty(inode); frag_to_free = tmp + frag1; next1: @@ -136,8 +127,7 @@ next1: continue; *p = 0; - inode->i_blocks -= uspi->s_nspb; - mark_inode_dirty(inode); + if (free_count == 0) { frag_to_free = tmp; free_count = uspi->s_fpb; @@ -148,6 +138,7 @@ next1: frag_to_free = tmp; free_count = uspi->s_fpb; } + mark_inode_dirty(inode); } if (free_count > 0) @@ -166,12 +157,12 @@ next1: frag4 = ufs_fragnum (frag4); *p = 0; - inode->i_blocks -= frag4 << uspi->s_nspfshift; - mark_inode_dirty(inode); + ufs_free_fragments (inode, tmp, frag4); + mark_inode_dirty(inode); next3: - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return retry; } @@ -186,7 +177,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) unsigned frag_to_free, free_count; int retry; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -227,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) frag_to_free = tmp; free_count = uspi->s_fpb; } - inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); } @@ -238,26 +229,21 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) if (*ubh_get_addr32(ind_ubh,i)) break; if (i >= uspi->s_apb) { - if (ubh_max_bcount(ind_ubh) != 1) { - retry = 1; - } - else { - tmp = fs32_to_cpu(sb, *p); - *p = 0; - inode->i_blocks -= uspi->s_nspb; - mark_inode_dirty(inode); - ufs_free_blocks (inode, tmp, uspi->s_fpb); - ubh_bforget(ind_ubh); - ind_ubh = NULL; - } + tmp = fs32_to_cpu(sb, *p); + *p = 0; + + ufs_free_blocks (inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(ind_ubh); + ind_ubh = NULL; } if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { - ubh_ll_rw_block (SWRITE, 1, &ind_ubh); + ubh_ll_rw_block(SWRITE, ind_ubh); ubh_wait_on_buffer (ind_ubh); } ubh_brelse (ind_ubh); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return retry; } @@ -271,7 +257,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) __fs32 * dind; int retry = 0; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -306,25 +292,21 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p) if (*ubh_get_addr32 (dind_bh, i)) break; if (i >= uspi->s_apb) { - if (ubh_max_bcount(dind_bh) != 1) - retry = 1; - else { - tmp = fs32_to_cpu(sb, *p); - *p = 0; - inode->i_blocks -= uspi->s_nspb; - mark_inode_dirty(inode); - ufs_free_blocks (inode, tmp, uspi->s_fpb); - ubh_bforget(dind_bh); - dind_bh = NULL; - } + tmp = fs32_to_cpu(sb, *p); + *p = 0; + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(dind_bh); + dind_bh = NULL; } if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { - ubh_ll_rw_block (SWRITE, 1, &dind_bh); + ubh_ll_rw_block(SWRITE, dind_bh); ubh_wait_on_buffer (dind_bh); } ubh_brelse (dind_bh); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return retry; } @@ -339,7 +321,7 @@ static int ufs_trunc_tindirect (struct inode * inode) __fs32 * tind, * p; int retry; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -370,25 +352,21 @@ static int ufs_trunc_tindirect (struct inode * inode) if (*ubh_get_addr32 (tind_bh, i)) break; if (i >= uspi->s_apb) { - if (ubh_max_bcount(tind_bh) != 1) - retry = 1; - else { - tmp = fs32_to_cpu(sb, *p); - *p = 0; - inode->i_blocks -= uspi->s_nspb; - mark_inode_dirty(inode); - ufs_free_blocks (inode, tmp, uspi->s_fpb); - ubh_bforget(tind_bh); - tind_bh = NULL; - } + tmp = fs32_to_cpu(sb, *p); + *p = 0; + + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ubh_bforget(tind_bh); + tind_bh = NULL; } if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { - ubh_ll_rw_block (SWRITE, 1, &tind_bh); + ubh_ll_rw_block(SWRITE, tind_bh); ubh_wait_on_buffer (tind_bh); } ubh_brelse (tind_bh); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); return retry; } @@ -399,7 +377,7 @@ void ufs_truncate (struct inode * inode) struct ufs_sb_private_info * uspi; int retry; - UFSD(("ENTER\n")) + UFSD("ENTER\n"); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -430,5 +408,5 @@ void ufs_truncate (struct inode * inode) ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); mark_inode_dirty(inode); - UFSD(("EXIT\n")) + UFSD("EXIT\n"); } diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 59acc8f073ac..a2f13f45708b 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -14,15 +14,6 @@ #include "swab.h" #include "util.h" -#undef UFS_UTILS_DEBUG - -#ifdef UFS_UTILS_DEBUG -#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; -#else -#define UFSD(x) -#endif - - struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, struct super_block *sb, u64 fragment, u64 size) { @@ -63,17 +54,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi, count = size >> uspi->s_fshift; if (count <= 0 || count > UFS_MAXFRAG) return NULL; - USPI_UBH->fragment = fragment; - USPI_UBH->count = count; + USPI_UBH(uspi)->fragment = fragment; + USPI_UBH(uspi)->count = count; for (i = 0; i < count; i++) - if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i))) + if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i))) goto failed; for (; i < UFS_MAXFRAG; i++) - USPI_UBH->bh[i] = NULL; - return USPI_UBH; + USPI_UBH(uspi)->bh[i] = NULL; + return USPI_UBH(uspi); failed: for (j = 0; j < i; j++) - brelse (USPI_UBH->bh[j]); + brelse (USPI_UBH(uspi)->bh[j]); return NULL; } @@ -90,11 +81,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh) void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) { unsigned i; - if (!USPI_UBH) + if (!USPI_UBH(uspi)) return; - for ( i = 0; i < USPI_UBH->count; i++ ) { - brelse (USPI_UBH->bh[i]); - USPI_UBH->bh[i] = NULL; + for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) { + brelse (USPI_UBH(uspi)->bh[i]); + USPI_UBH(uspi)->bh[i] = NULL; } } @@ -121,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) } } -void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[]) +void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) { - unsigned i; if (!ubh) return; - for ( i = 0; i < nr; i++ ) - ll_rw_block (rw, ubh[i]->count, ubh[i]->bh); + + ll_rw_block(rw, ubh->count, ubh->bh); } void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) @@ -139,18 +129,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) wait_on_buffer (ubh->bh[i]); } -unsigned ubh_max_bcount (struct ufs_buffer_head * ubh) -{ - unsigned i; - unsigned max = 0; - if (!ubh) - return 0; - for ( i = 0; i < ubh->count; i++ ) - if ( atomic_read(&ubh->bh[i]->b_count) > max ) - max = atomic_read(&ubh->bh[i]->b_count); - return max; -} - void ubh_bforget (struct ufs_buffer_head * ubh) { unsigned i; diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 48d6d9bcc157..406981fff5e7 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -17,10 +17,16 @@ #define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) /* - * macros used for retyping + * functions used for retyping */ -#define UCPI_UBH ((struct ufs_buffer_head *)ucpi) -#define USPI_UBH ((struct ufs_buffer_head *)uspi) +static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi) +{ + return &cpi->c_ubh; +} +static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi) +{ + return &spi->s_ubh; +} @@ -33,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, { switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUN: - return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state); + return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state); case UFS_ST_SUNx86: return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state); case UFS_ST_44BSD: default: - return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state); + return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state); } } @@ -48,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1, { switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUN: - usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value); + usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_SUNx86: usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value); break; case UFS_ST_44BSD: - usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value); + usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value); break; } } @@ -64,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_third *usb3) { if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) - return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect); + return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect); else return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect); } @@ -76,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3) switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUN: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1]; break; case UFS_ST_SUNx86: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1]; break; case UFS_ST_44BSD: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1]; break; } @@ -99,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3) switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUN: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1]; break; case UFS_ST_SUNx86: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1]; break; case UFS_ST_44BSD: - ((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0]; - ((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1]; + ((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0]; + ((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1]; break; } @@ -236,9 +242,8 @@ extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); -extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **); +extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); extern void ubh_wait_on_buffer (struct ufs_buffer_head *); -extern unsigned ubh_max_bcount (struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) @@ -297,40 +302,26 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi, #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) - -/* - * Macros for access to superblock array structures - */ -#define ubh_postbl(ubh,cylno,i) \ - ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \ - ? (*(__s16*)(ubh_get_addr(ubh, \ - (unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \ - + (((cylno) * 16 + (i)) << 1) ) )) \ - : (*(__s16*)(ubh_get_addr(ubh, \ - uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) )))) - -#define ubh_rotbl(ubh,i) \ - ((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \ - ? (*(__u8*)(ubh_get_addr(ubh, \ - (unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \ - : (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i))))) - /* * Determine the number of available frags given a * percentage to hold in reserve. */ -#define ufs_freespace(usb, percentreserved) \ - (ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \ - fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100)) +static inline u64 +ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +{ + return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + + uspi->cs_total.cs_nffree - + (uspi->s_dsize * (percentreserved) / 100); +} /* * Macros to access cylinder group array structures */ #define ubh_cg_blktot(ucpi,cylno) \ - (*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2)))) + (*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2)))) #define ubh_cg_blks(ucpi,cylno,rpos) \ - (*((__fs16*)ubh_get_addr(UCPI_UBH, \ + (*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \ (ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 )))) /* @@ -508,29 +499,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap, if (fragsize > 0 && fragsize < uspi->s_fpb) fs32_add(sb, &fraglist[fragsize], cnt); } - -#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask) -static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, - unsigned begin, unsigned size, unsigned char * table, unsigned char mask) -{ - unsigned rest, offset; - unsigned char * cp; - - - offset = begin & ~uspi->s_fmask; - begin >>= uspi->s_fshift; - for (;;) { - if ((offset + size) < uspi->s_fsize) - rest = size; - else - rest = uspi->s_fsize - offset; - size -= rest; - cp = ubh->bh[begin]->b_data + offset; - while ((table[*cp++] & mask) == 0 && --rest); - if (rest || !size) - break; - begin++; - offset = 0; - } - return (size + rest); -} diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 70662371bb11..3d4f6dff2113 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -299,7 +299,8 @@ xfs_file_open( STATIC int xfs_file_close( - struct file *filp) + struct file *filp, + fl_owner_t id) { return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 10dbf203c62f..ed7579beb6b0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit( * is present to prevent thrashing). */ +#ifdef CONFIG_HOTPLUG_CPU /* * hot-plug CPU notifier support. * - * We cannot use the hotcpu_register() function because it does - * not allow notifier instances. We need a notifier per filesystem - * as we need to be able to identify the filesystem to balance - * the counters out. This is achieved by having a notifier block - * embedded in the xfs_mount_t and doing pointer magic to get the - * mount pointer from the notifier block address. + * We need a notifier per filesystem as we need to be able to identify + * the filesystem to balance the counters out. This is achieved by + * having a notifier block embedded in the xfs_mount_t and doing pointer + * magic to get the mount pointer from the notifier block address. */ STATIC int xfs_icsb_cpu_notify( @@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify( return NOTIFY_OK; } +#endif /* CONFIG_HOTPLUG_CPU */ int xfs_icsb_init_counters( @@ -1791,9 +1791,11 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; +#ifdef CONFIG_HOTPLUG_CPU mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; mp->m_icsb_notifier.priority = 0; - register_cpu_notifier(&mp->m_icsb_notifier); + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); @@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters( xfs_mount_t *mp) { if (mp->m_sb_cnts) { - unregister_cpu_notifier(&mp->m_icsb_notifier); + unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } } |