author     Michal Marek <mmarek@suse.cz>    2010-10-28 00:15:57 +0200
committer  Michal Marek <mmarek@suse.cz>    2010-10-28 00:15:57 +0200
commit     b74b953b998bcc2db91b694446f3a2619ec32de6 (patch)
tree       6ce24caabd730f6ae9287ed0676ec32e6ff31e9d /ipc
parent     abb438526201c6a79949ad45375c051b6681c253 (diff)
parent     f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff)
Merge commit 'v2.6.36' into kbuild/misc
Update to be able to fix a recent change to scripts/basic/docproc.c
(commit eda603f).
Diffstat (limited to 'ipc')
-rw-r--r--   ipc/Makefile     2
-rw-r--r--   ipc/compat.c     1
-rw-r--r--   ipc/mqueue.c   212
-rw-r--r--   ipc/msg.c       13
-rw-r--r--   ipc/sem.c      370
-rw-r--r--   ipc/shm.c       14
-rw-r--r--   ipc/syscall.c   99
-rw-r--r--   ipc/util.c       4
8 files changed, 493 insertions, 222 deletions
diff --git a/ipc/Makefile b/ipc/Makefile
index 4e1955ea815d..9075e172e52c 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o
 obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
 obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
diff --git a/ipc/compat.c b/ipc/compat.c
index ab76fb0ef844..9dc2c7d3c9e6 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/msg.h>
 #include <linux/shm.h>
-#include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/mutex.h>
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c79bd57353e7..c60e519e2917 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -32,6 +32,7 @@
 #include <linux/nsproxy.h>
 #include <linux/pid.h>
 #include <linux/ipc_namespace.h>
+#include <linux/slab.h>
 #include <net/sock.h>
 #include "util.h"
@@ -134,7 +135,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		init_waitqueue_head(&info->wait_q);
 		INIT_LIST_HEAD(&info->e_wait_q[0].list);
 		INIT_LIST_HEAD(&info->e_wait_q[1].list);
-		info->messages = NULL;
 		info->notify_owner = NULL;
 		info->qsize = 0;
 		info->user = NULL;	/* set when all is ok */
@@ -146,26 +146,24 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 			info->attr.mq_msgsize = attr->mq_msgsize;
 		}
 		mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
+		info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
+		if (!info->messages)
+			goto out_inode;
+
 		mq_bytes = (mq_msg_tblsz +
 			(info->attr.mq_maxmsg * info->attr.mq_msgsize));
 
 		spin_lock(&mq_lock);
 		if (u->mq_bytes + mq_bytes < u->mq_bytes ||
 		    u->mq_bytes + mq_bytes >
-		    p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
+		    task_rlimit(p, RLIMIT_MSGQUEUE)) {
 			spin_unlock(&mq_lock);
+			/* mqueue_evict_inode() releases info->messages */
 			goto out_inode;
 		}
 		u->mq_bytes += mq_bytes;
 		spin_unlock(&mq_lock);
 
-		info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
-		if (!info->messages) {
-			spin_lock(&mq_lock);
-			u->mq_bytes -= mq_bytes;
-			spin_unlock(&mq_lock);
-			goto out_inode;
-		}
 		/* all is ok */
 		info->user = get_uid(u);
 	} else if (S_ISDIR(mode)) {
@@ -178,7 +176,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 	}
 	return inode;
 out_inode:
-	make_bad_inode(inode);
 	iput(inode);
 	return NULL;
 }
@@ -187,7 +184,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
 	struct ipc_namespace *ns = data;
-	int error = 0;
+	int error;
 
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -205,7 +202,9 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		iput(inode);
 		error = -ENOMEM;
+		goto out;
 	}
+	error = 0;
 
 out:
 	return error;
@@ -242,7 +241,7 @@ static void mqueue_destroy_inode(struct inode *inode)
 	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
 }
 
-static void mqueue_delete_inode(struct inode *inode)
+static void mqueue_evict_inode(struct inode *inode)
 {
 	struct mqueue_inode_info *info;
 	struct user_struct *user;
@@ -250,10 +249,11 @@ static void mqueue_delete_inode(struct inode *inode)
 	int i;
 	struct ipc_namespace *ipc_ns;
 
-	if (S_ISDIR(inode->i_mode)) {
-		clear_inode(inode);
+	end_writeback(inode);
+
+	if (S_ISDIR(inode->i_mode))
 		return;
-	}
+
 	ipc_ns = get_ns_from_inode(inode);
 	info = MQUEUE_I(inode);
 	spin_lock(&info->lock);
@@ -262,10 +262,9 @@ static void mqueue_delete_inode(struct inode *inode)
 		kfree(info->messages);
 	spin_unlock(&info->lock);
 
-	clear_inode(inode);
-
-	mq_bytes = (info->attr.mq_maxmsg * sizeof(struct msg_msg *) +
-		   (info->attr.mq_maxmsg * info->attr.mq_msgsize));
+	/* Total amount of bytes accounted for the mqueue */
+	mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *) +
+		info->attr.mq_msgsize);
 	user = info->user;
 	if (user) {
 		spin_lock(&mq_lock);
@@ -428,7 +427,7 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
  * sr: SEND or RECV
  */
 static int wq_sleep(struct mqueue_inode_info *info, int sr,
-		long timeout, struct ext_wait_queue *ewp)
+		ktime_t *timeout, struct ext_wait_queue *ewp)
 {
 	int retval;
 	signed long time;
@@ -439,7 +438,8 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		spin_unlock(&info->lock);
-		time = schedule_timeout(timeout);
+		time = schedule_hrtimeout_range_clock(timeout,
+		    HRTIMER_MODE_ABS, 0, CLOCK_REALTIME);
 
 		while (ewp->state == STATE_PENDING)
 			cpu_relax();
@@ -551,31 +551,16 @@ static void __do_notify(struct mqueue_inode_info *info)
 	wake_up(&info->wait_q);
 }
 
-static long prepare_timeout(struct timespec *p)
+static int prepare_timeout(const struct timespec __user *u_abs_timeout,
+			   ktime_t *expires, struct timespec *ts)
 {
-	struct timespec nowts;
-	long timeout;
-
-	if (p) {
-		if (unlikely(p->tv_nsec < 0 || p->tv_sec < 0
-			|| p->tv_nsec >= NSEC_PER_SEC))
-			return -EINVAL;
-		nowts = CURRENT_TIME;
-		/* first subtract as jiffies can't be too big */
-		p->tv_sec -= nowts.tv_sec;
-		if (p->tv_nsec < nowts.tv_nsec) {
-			p->tv_nsec += NSEC_PER_SEC;
-			p->tv_sec--;
-		}
-		p->tv_nsec -= nowts.tv_nsec;
-		if (p->tv_sec < 0)
-			return 0;
-
-		timeout = timespec_to_jiffies(p) + 1;
-	} else
-		return MAX_SCHEDULE_TIMEOUT;
+	if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
+		return -EFAULT;
+	if (!timespec_valid(ts))
+		return -EINVAL;
 
-	return timeout;
+	*expires = timespec_to_ktime(*ts);
+	return 0;
 }
 
 static void remove_notification(struct mqueue_inode_info *info)
@@ -604,8 +589,8 @@ static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
 	/* check for overflow */
 	if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
 		return 0;
-	if ((unsigned long)(attr->mq_maxmsg * attr->mq_msgsize) +
-	    (attr->mq_maxmsg * sizeof (struct msg_msg *)) <
+	if ((unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize +
+	    sizeof (struct msg_msg *))) <
 	    (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize))
 		return 0;
 	return 1;
@@ -623,9 +608,10 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
 	int ret;
 
 	if (attr) {
-		ret = -EINVAL;
-		if (!mq_attr_ok(ipc_ns, attr))
+		if (!mq_attr_ok(ipc_ns, attr)) {
+			ret = -EINVAL;
 			goto out;
+		}
 		/* store for use during create */
 		dentry->d_fsdata = attr;
 	}
@@ -659,24 +645,28 @@ out:
 static struct file *do_open(struct ipc_namespace *ipc_ns,
 				struct dentry *dentry, int oflag)
 {
+	int ret;
 	const struct cred *cred = current_cred();
 
 	static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
 						  MAY_READ | MAY_WRITE };
 
 	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
-		dput(dentry);
-		mntput(ipc_ns->mq_mnt);
-		return ERR_PTR(-EINVAL);
+		ret = -EINVAL;
+		goto err;
 	}
 
 	if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) {
-		dput(dentry);
-		mntput(ipc_ns->mq_mnt);
-		return ERR_PTR(-EACCES);
+		ret = -EACCES;
+		goto err;
 	}
 
 	return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
+
+err:
+	dput(dentry);
+	mntput(ipc_ns->mq_mnt);
+	return ERR_PTR(ret);
 }
 
 SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
@@ -705,16 +695,17 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
 	dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
-		goto out_err;
+		goto out_putfd;
 	}
 	mntget(ipc_ns->mq_mnt);
 
 	if (oflag & O_CREAT) {
 		if (dentry->d_inode) {	/* entry already exists */
 			audit_inode(name, dentry);
-			error = -EEXIST;
-			if (oflag & O_EXCL)
+			if (oflag & O_EXCL) {
+				error = -EEXIST;
 				goto out;
+			}
 			filp = do_open(ipc_ns, dentry, oflag);
 		} else {
 			filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
@@ -722,9 +713,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
 					u_attr ? &attr : NULL);
 		}
 	} else {
-		error = -ENOENT;
-		if (!dentry->d_inode)
+		if (!dentry->d_inode) {
+			error = -ENOENT;
 			goto out;
+		}
 		audit_inode(name, dentry);
 		filp = do_open(ipc_ns, dentry, oflag);
 	}
@@ -742,7 +734,6 @@ out:
 	mntput(ipc_ns->mq_mnt);
 out_putfd:
 	put_unused_fd(fd);
-out_err:
 	fd = error;
 out_upsem:
 	mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
@@ -855,36 +846,40 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	struct ext_wait_queue *receiver;
 	struct msg_msg *msg_ptr;
 	struct mqueue_inode_info *info;
-	struct timespec ts, *p = NULL;
-	long timeout;
+	ktime_t expires, *timeout = NULL;
+	struct timespec ts;
 	int ret;
 
 	if (u_abs_timeout) {
-		if (copy_from_user(&ts, u_abs_timeout,
-					sizeof(struct timespec)))
-			return -EFAULT;
-		p = &ts;
+		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+		if (res)
+			return res;
+		timeout = &expires;
 	}
 
 	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
 		return -EINVAL;
 
-	audit_mq_sendrecv(mqdes, msg_len, msg_prio, p);
-	timeout = prepare_timeout(p);
+	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
 
-	ret = -EBADF;
 	filp = fget(mqdes);
-	if (unlikely(!filp))
+	if (unlikely(!filp)) {
+		ret = -EBADF;
 		goto out;
+	}
 
 	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations))
+	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
 		goto out_fput;
+	}
 	info = MQUEUE_I(inode);
 	audit_inode(NULL, filp->f_path.dentry);
 
-	if (unlikely(!(filp->f_mode & FMODE_WRITE)))
+	if (unlikely(!(filp->f_mode & FMODE_WRITE))) {
+		ret = -EBADF;
 		goto out_fput;
+	}
 
 	if (unlikely(msg_len > info->attr.mq_msgsize)) {
 		ret = -EMSGSIZE;
@@ -907,9 +902,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 		if (filp->f_flags & O_NONBLOCK) {
 			spin_unlock(&info->lock);
 			ret = -EAGAIN;
-		} else if (unlikely(timeout < 0)) {
-			spin_unlock(&info->lock);
-			ret = timeout;
 		} else {
 			wait.task = current;
 			wait.msg = (void *) msg_ptr;
@@ -942,38 +934,42 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 		size_t, msg_len, unsigned int __user *, u_msg_prio,
 		const struct timespec __user *, u_abs_timeout)
 {
-	long timeout;
 	ssize_t ret;
 	struct msg_msg *msg_ptr;
 	struct file *filp;
 	struct inode *inode;
 	struct mqueue_inode_info *info;
 	struct ext_wait_queue wait;
-	struct timespec ts, *p = NULL;
+	ktime_t expires, *timeout = NULL;
+	struct timespec ts;
 
 	if (u_abs_timeout) {
-		if (copy_from_user(&ts, u_abs_timeout,
-					sizeof(struct timespec)))
-			return -EFAULT;
-		p = &ts;
+		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+		if (res)
+			return res;
+		timeout = &expires;
 	}
 
-	audit_mq_sendrecv(mqdes, msg_len, 0, p);
-	timeout = prepare_timeout(p);
+	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
 
-	ret = -EBADF;
 	filp = fget(mqdes);
-	if (unlikely(!filp))
+	if (unlikely(!filp)) {
+		ret = -EBADF;
 		goto out;
+	}
 
 	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations))
+	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
 		goto out_fput;
+	}
	info = MQUEUE_I(inode);
 	audit_inode(NULL, filp->f_path.dentry);
 
-	if (unlikely(!(filp->f_mode & FMODE_READ)))
+	if (unlikely(!(filp->f_mode & FMODE_READ))) {
+		ret = -EBADF;
 		goto out_fput;
+	}
 
 	/* checks if buffer is big enough */
 	if (unlikely(msg_len < info->attr.mq_msgsize)) {
@@ -986,11 +982,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 		if (filp->f_flags & O_NONBLOCK) {
 			spin_unlock(&info->lock);
 			ret = -EAGAIN;
-			msg_ptr = NULL;
-		} else if (unlikely(timeout < 0)) {
-			spin_unlock(&info->lock);
-			ret = timeout;
-			msg_ptr = NULL;
 		} else {
 			wait.task = current;
 			wait.state = STATE_NONE;
@@ -1063,13 +1054,14 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
 
 			/* create the notify skb */
 			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
-			ret = -ENOMEM;
-			if (!nc)
+			if (!nc) {
+				ret = -ENOMEM;
 				goto out;
-			ret = -EFAULT;
+			}
 			if (copy_from_user(nc->data,
 					notification.sigev_value.sival_ptr,
 					NOTIFY_COOKIE_LEN)) {
+				ret = -EFAULT;
 				goto out;
 			}
@@ -1078,9 +1070,10 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
 			/* and attach it to the socket */
 retry:
 			filp = fget(notification.sigev_signo);
-			ret = -EBADF;
-			if (!filp)
+			if (!filp) {
+				ret = -EBADF;
 				goto out;
+			}
 			sock = netlink_getsockbyfilp(filp);
 			fput(filp);
 			if (IS_ERR(sock)) {
@@ -1092,7 +1085,7 @@ retry:
 			timeo = MAX_SCHEDULE_TIMEOUT;
 			ret = netlink_attachskb(sock, nc, &timeo, NULL);
 			if (ret == 1)
-			       	goto retry;
+				goto retry;
 			if (ret) {
 				sock = NULL;
 				nc = NULL;
@@ -1101,14 +1094,17 @@ retry:
 		}
 	}
 
-	ret = -EBADF;
 	filp = fget(mqdes);
-	if (!filp)
+	if (!filp) {
+		ret = -EBADF;
 		goto out;
+	}
 
 	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations))
+	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
 		goto out_fput;
+	}
 	info = MQUEUE_I(inode);
 
 	ret = 0;
@@ -1171,14 +1167,17 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
 			return -EINVAL;
 	}
 
-	ret = -EBADF;
 	filp = fget(mqdes);
-	if (!filp)
+	if (!filp) {
+		ret = -EBADF;
 		goto out;
+	}
 
 	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations))
+	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
 		goto out_fput;
+	}
 	info = MQUEUE_I(inode);
 
 	spin_lock(&info->lock);
@@ -1225,9 +1224,8 @@ static const struct file_operations mqueue_file_operations = {
 static const struct super_operations mqueue_super_ops = {
 	.alloc_inode = mqueue_alloc_inode,
 	.destroy_inode = mqueue_destroy_inode,
+	.evict_inode = mqueue_evict_inode,
 	.statfs = simple_statfs,
-	.delete_inode = mqueue_delete_inode,
-	.drop_inode = generic_delete_inode,
 };
 
 static struct file_system_type mqueue_fs_type = {
@@ -1272,7 +1270,7 @@ static int __init init_mqueue_fs(void)
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
-	/* ignore failues - they are not fatal */
+	/* ignore failures - they are not fatal */
 	mq_sysctl_table = mq_register_sysctl_table();
 
 	error = register_filesystem(&mqueue_fs_type);
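The mqueue.c hunks above replace the old jiffies-based relative timeout with an absolute CLOCK_REALTIME hrtimer: prepare_timeout() now only validates and converts the user timespec, and wq_sleep() arms it via schedule_hrtimeout_range_clock() in absolute mode. A minimal userspace sketch of the semantics this serves (illustrative only: the queue name /mq_demo, the one-slot queue and the two-second deadline are invented for the example; link with -lrt on older glibc):

	/* mq_timedsend() takes an *absolute* CLOCK_REALTIME deadline, which is
	 * why the kernel side now arms an absolute hrtimer on that clock. */
	#include <fcntl.h>
	#include <mqueue.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct mq_attr attr = { .mq_maxmsg = 1, .mq_msgsize = 32 };
		struct timespec deadline;
		mqd_t q = mq_open("/mq_demo", O_CREAT | O_WRONLY, 0600, &attr);

		if (q == (mqd_t)-1) {
			perror("mq_open");
			return 1;
		}
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 2;			/* absolute: now + 2s */

		mq_send(q, "first", 5, 0);		/* fills the one-slot queue */
		if (mq_timedsend(q, "second", 6, 0, &deadline) == -1)
			perror("mq_timedsend");		/* ETIMEDOUT after ~2s */

		mq_close(q);
		mq_unlink("/mq_demo");
		return 0;
	}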
diff --git a/ipc/msg.c b/ipc/msg.c
index af42ef8900a6..747b65507a91 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -23,7 +23,6 @@
  */
 
 #include <linux/capability.h>
-#include <linux/slab.h>
 #include <linux/msg.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
@@ -346,19 +345,19 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
 		out.msg_rtime		= in->msg_rtime;
 		out.msg_ctime		= in->msg_ctime;
 
-		if (in->msg_cbytes > USHORT_MAX)
-			out.msg_cbytes	= USHORT_MAX;
+		if (in->msg_cbytes > USHRT_MAX)
+			out.msg_cbytes	= USHRT_MAX;
 		else
 			out.msg_cbytes	= in->msg_cbytes;
 		out.msg_lcbytes = in->msg_cbytes;
 
-		if (in->msg_qnum > USHORT_MAX)
-			out.msg_qnum	= USHORT_MAX;
+		if (in->msg_qnum > USHRT_MAX)
+			out.msg_qnum	= USHRT_MAX;
 		else
 			out.msg_qnum	= in->msg_qnum;
 
-		if (in->msg_qbytes > USHORT_MAX)
-			out.msg_qbytes	= USHORT_MAX;
+		if (in->msg_qbytes > USHRT_MAX)
+			out.msg_qbytes	= USHRT_MAX;
 		else
 			out.msg_qbytes	= in->msg_qbytes;
 		out.msg_lqbytes = in->msg_qbytes;
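The msg.c hunk is a pure rename (USHORT_MAX becomes the generic USHRT_MAX from kernel.h); the saturation pattern it preserves exists because the legacy msqid_ds ABI fields are 16 bits wide. Restated minimally (the count value is hypothetical):

	#include <limits.h>	/* userspace <limits.h> also defines USHRT_MAX */
	#include <stdio.h>

	int main(void)
	{
		unsigned long msg_cbytes = 70000;	/* hypothetical 64-bit count */
		unsigned short out;

		/* old ABI field is 16 bits, so saturate rather than truncate */
		out = msg_cbytes > USHRT_MAX ? USHRT_MAX : msg_cbytes;
		printf("%u\n", out);			/* prints 65535 */
		return 0;
	}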
diff --git a/ipc/sem.c b/ipc/sem.c
index dbef95b15941..0e0d49bbb867 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -3,56 +3,6 @@
  * Copyright (C) 1992 Krishna Balasubramanian
  * Copyright (C) 1995 Eric Schenk, Bruno Haible
  *
- * IMPLEMENTATION NOTES ON CODE REWRITE (Eric Schenk, January 1995):
- * This code underwent a massive rewrite in order to solve some problems
- * with the original code. In particular the original code failed to
- * wake up processes that were waiting for semval to go to 0 if the
- * value went to 0 and was then incremented rapidly enough. In solving
- * this problem I have also modified the implementation so that it
- * processes pending operations in a FIFO manner, thus give a guarantee
- * that processes waiting for a lock on the semaphore won't starve
- * unless another locking process fails to unlock.
- * In addition the following two changes in behavior have been introduced:
- * - The original implementation of semop returned the value
- *   last semaphore element examined on success. This does not
- *   match the manual page specifications, and effectively
- *   allows the user to read the semaphore even if they do not
- *   have read permissions. The implementation now returns 0
- *   on success as stated in the manual page.
- * - There is some confusion over whether the set of undo adjustments
- *   to be performed at exit should be done in an atomic manner.
- *   That is, if we are attempting to decrement the semval should we queue
- *   up and wait until we can do so legally?
- *   The original implementation attempted to do this.
- *   The current implementation does not do so. This is because I don't
- *   think it is the right thing (TM) to do, and because I couldn't
- *   see a clean way to get the old behavior with the new design.
- *   The POSIX standard and SVID should be consulted to determine
- *   what behavior is mandated.
- *
- * Further notes on refinement (Christoph Rohland, December 1998):
- * - The POSIX standard says, that the undo adjustments simply should
- *   redo. So the current implementation is o.K.
- * - The previous code had two flaws:
- *   1) It actively gave the semaphore to the next waiting process
- *      sleeping on the semaphore. Since this process did not have the
- *      cpu this led to many unnecessary context switches and bad
- *      performance. Now we only check which process should be able to
- *      get the semaphore and if this process wants to reduce some
- *      semaphore value we simply wake it up without doing the
- *      operation. So it has to try to get it later. Thus e.g. the
- *      running process may reacquire the semaphore during the current
- *      time slice. If it only waits for zero or increases the semaphore,
- *      we do the operation in advance and wake it up.
- *   2) It did not wake up all zero waiting processes. We try to do
- *      better but only get the semops right which only wait for zero or
- *      increase. If there are decrement operations in the operations
- *      array we do the same as before.
- *
- * With the incarnation of O(1) scheduler, it becomes unnecessary to perform
- * check/retry algorithm for waking up blocked processes as the new scheduler
- * is better at handling thread switch than the old one.
- *
  * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
  *
  * SMP-threaded, sysctl's added
@@ -61,6 +11,8 @@
  * (c) 2001 Red Hat Inc
  * Lockless wakeup
  * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ * Further wakeup optimizations, documentation
+ * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
  *
  * support for audit of ipc object properties and permission changes
  * Dustin Kirkland <dustin.kirkland@us.ibm.com>
@@ -68,6 +20,57 @@
  * namespaces support
  * OpenVZ, SWsoft Inc.
  * Pavel Emelianov <xemul@openvz.org>
+ *
+ * Implementation notes: (May 2010)
+ * This file implements System V semaphores.
+ *
+ * User space visible behavior:
+ * - FIFO ordering for semop() operations (just FIFO, not starvation
+ *   protection)
+ * - multiple semaphore operations that alter the same semaphore in
+ *   one semop() are handled.
+ * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
+ *   SETALL calls.
+ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
+ * - undo adjustments at process exit are limited to 0..SEMVMX.
+ * - namespace are supported.
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing
+ *   to /proc/sys/kernel/sem.
+ * - statistics about the usage are reported in /proc/sysvipc/sem.
+ *
+ * Internals:
+ * - scalability:
+ *   - all global variables are read-mostly.
+ *   - semop() calls and semctl(RMID) are synchronized by RCU.
+ *   - most operations do write operations (actually: spin_lock calls) to
+ *     the per-semaphore array structure.
+ *     Thus: Perfect SMP scaling between independent semaphore arrays.
+ *     If multiple semaphores in one array are used, then cache line
+ *     trashing on the semaphore array spinlock will limit the scaling.
+ * - semncnt and semzcnt are calculated on demand in count_semncnt() and
+ *   count_semzcnt()
+ * - the task that performs a successful semop() scans the list of all
+ *   sleeping tasks and completes any pending operations that can be fulfilled.
+ *   Semaphores are actively given to waiting tasks (necessary for FIFO).
+ *   (see update_queue())
+ * - To improve the scalability, the actual wake-up calls are performed after
+ *   dropping all locks. (see wake_up_sem_queue_prepare(),
+ *   wake_up_sem_queue_do())
+ * - All work is done by the waker, the woken up task does not have to do
+ *   anything - not even acquiring a lock or dropping a refcount.
+ * - A woken up task may not even touch the semaphore array anymore, it may
+ *   have been destroyed already by a semctl(RMID).
+ * - The synchronizations between wake-ups due to a timeout/signal and a
+ *   wake-up due to a completed semaphore operation is achieved by using an
+ *   intermediate state (IN_WAKEUP).
+ * - UNDO values are stored in an array (one per process and per
+ *   semaphore array, lazily allocated). For backwards compatibility, multiple
+ *   modes for the UNDO variables are supported (per process, per thread)
+ *   (see copy_semundo, CLONE_SYSVSEM)
+ * - There are two lists of the pending operations: a per-array list
+ *   and per-semaphore list (stored in the array). This allows to achieve FIFO
+ *   ordering without always scanning all pending operations.
+ *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
  */
 
 #include <linux/slab.h>
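The new implementation notes promise FIFO ordering and atomic multi-operation semop() calls. A small userspace sketch of the second point (illustrative only: the IPC_PRIVATE set, the IPC_NOWAIT/SEM_UNDO flags and the two-semaphore layout are invented for the example; on Linux, semaphores in a fresh set start at 0):

	/* One semop() call carrying two operations; both are applied
	 * atomically or neither is, as the notes above describe. */
	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	int main(void)
	{
		int id = semget(IPC_PRIVATE, 2, IPC_CREAT | 0600);
		struct sembuf ops[2] = {
			{ .sem_num = 0, .sem_op = 0, .sem_flg = IPC_NOWAIT }, /* sem 0 must be 0 */
			{ .sem_num = 1, .sem_op = 1, .sem_flg = SEM_UNDO },   /* then bump sem 1 */
		};

		if (id < 0) {
			perror("semget");
			return 1;
		}
		/* Either both operations commit, or neither does; concurrent
		 * callers are serviced in FIFO order by update_queue(). */
		if (semop(id, ops, 2) < 0)
			perror("semop");

		semctl(id, 0, IPC_RMID);
		return 0;
	}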
@@ -381,7 +384,6 @@ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
 		sop--;
 	}
 
-	sma->sem_otime = get_seconds();
 	return 0;
 
 out_of_range:
@@ -404,25 +406,51 @@ undo:
 	return result;
 }
 
-/*
- * Wake up a process waiting on the sem queue with a given error.
- * The queue is invalid (may not be accessed) after the function returns.
+/** wake_up_sem_queue_prepare(q, error): Prepare wake-up
+ * @q: queue entry that must be signaled
+ * @error: Error value for the signal
+ *
+ * Prepare the wake-up of the queue entry q.
  */
-static void wake_up_sem_queue(struct sem_queue *q, int error)
+static void wake_up_sem_queue_prepare(struct list_head *pt,
+				struct sem_queue *q, int error)
 {
-	/*
-	 * Hold preempt off so that we don't get preempted and have the
-	 * wakee busy-wait until we're scheduled back on. We're holding
-	 * locks here so it may not strictly be needed, however if the
-	 * locks become preemptible then this prevents such a problem.
-	 */
-	preempt_disable();
+	if (list_empty(pt)) {
+		/*
+		 * Hold preempt off so that we don't get preempted and have the
+		 * wakee busy-wait until we're scheduled back on.
+		 */
+		preempt_disable();
+	}
 	q->status = IN_WAKEUP;
-	wake_up_process(q->sleeper);
-	/* hands-off: q can disappear immediately after writing q->status. */
-	smp_wmb();
-	q->status = error;
-	preempt_enable();
+	q->pid = error;
+
+	list_add_tail(&q->simple_list, pt);
+}
+
+/**
+ * wake_up_sem_queue_do(pt) - do the actual wake-up
+ * @pt: list of tasks to be woken up
+ *
+ * Do the actual wake-up.
+ * The function is called without any locks held, thus the semaphore array
+ * could be destroyed already and the tasks can disappear as soon as the
+ * status is set to the actual return code.
+ */
+static void wake_up_sem_queue_do(struct list_head *pt)
+{
+	struct sem_queue *q, *t;
+	int did_something;
+
+	did_something = !list_empty(pt);
+	list_for_each_entry_safe(q, t, pt, simple_list) {
+		wake_up_process(q->sleeper);
+		/* q can disappear immediately after writing q->status. */
+		smp_wmb();
+		q->status = q->pid;
+	}
+	if (did_something)
+		preempt_enable();
 }
 
 static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
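The prepare/do split above defers wake_up_process() until every lock has been dropped, with IN_WAKEUP bridging the gap (the kernel parks the return code in q->pid between the two phases). A userspace analogue of the handshake (illustrative only: C11 atomics and pthreads stand in for the scheduler, and the status constants are invented for the sketch; compile with -pthread):

	#include <pthread.h>
	#include <sched.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define STATE_SLEEPING	(-2)	/* invented: waiter still blocked */
	#define IN_WAKEUP	(-1)	/* wakeup in flight, final code not yet set */

	struct waiter {
		struct waiter *next;
		atomic_int status;	/* plays the role of q->status */
	};

	static pthread_mutex_t array_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Phase 1, with array_lock held: only mark the waiter and collect it
	 * on a local batch list - no side effect that lets the sleeper run. */
	static void wakeup_prepare(struct waiter *w, struct waiter **batch)
	{
		atomic_store(&w->status, IN_WAKEUP);
		w->next = *batch;
		*batch = w;
	}

	/* Phase 2, with no locks held: publish the final status. After the
	 * store the waiter may vanish, so it must not be touched again - the
	 * same rule the wake_up_sem_queue_do() comment states. */
	static void wakeup_do(struct waiter *batch, int error)
	{
		struct waiter *w, *next;

		for (w = batch; w; w = next) {
			next = w->next;
			atomic_store(&w->status, error);
		}
	}

	static void *sleeper(void *arg)
	{
		struct waiter *w = arg;
		int s;

		/* Spin past IN_WAKEUP, mirroring get_queue_result(): we may
		 * run while the waker is between phase 1 and phase 2. */
		while ((s = atomic_load(&w->status)) == STATE_SLEEPING ||
		       s == IN_WAKEUP)
			sched_yield();
		printf("woken, status %d\n", s);
		return NULL;
	}

	int main(void)
	{
		struct waiter w = { .next = NULL, .status = STATE_SLEEPING };
		struct waiter *batch = NULL;
		pthread_t thr;

		pthread_create(&thr, NULL, sleeper, &w);

		pthread_mutex_lock(&array_lock);
		wakeup_prepare(&w, &batch);	/* under the "array" lock */
		pthread_mutex_unlock(&array_lock);
		wakeup_do(batch, 0);		/* actual wake-up: no locks held */

		pthread_join(thr, NULL);
		return 0;
	}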
@@ -434,22 +462,90 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
 	sma->complex_count--;
 }
 
+/** check_restart(sma, q)
+ * @sma: semaphore array
+ * @q: the operation that just completed
+ *
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+ * was completed.
+ */
+static int check_restart(struct sem_array *sma, struct sem_queue *q)
+{
+	struct sem *curr;
+	struct sem_queue *h;
+
+	/* if the operation didn't modify the array, then no restart */
+	if (q->alter == 0)
+		return 0;
+
+	/* pending complex operations are too difficult to analyse */
+	if (sma->complex_count)
+		return 1;
+
+	/* we were a sleeping complex operation. Too difficult */
+	if (q->nsops > 1)
+		return 1;
+
+	curr = sma->sem_base + q->sops[0].sem_num;
+
+	/* No-one waits on this queue */
+	if (list_empty(&curr->sem_pending))
+		return 0;
+
+	/* the new semaphore value */
+	if (curr->semval) {
+		/* It is impossible that someone waits for the new value:
+		 * - q is a previously sleeping simple operation that
+		 *   altered the array. It must be a decrement, because
+		 *   simple increments never sleep.
+		 * - The value is not 0, thus wait-for-zero won't proceed.
+		 * - If there are older (higher priority) decrements
+		 *   in the queue, then they have observed the original
+		 *   semval value and couldn't proceed. The operation
+		 *   decremented to value - thus they won't proceed either.
+		 */
+		BUG_ON(q->sops[0].sem_op >= 0);
+		return 0;
+	}
+	/*
+	 * semval is 0. Check if there are wait-for-zero semops.
+	 * They must be the first entries in the per-semaphore simple queue
+	 */
+	h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list);
+	BUG_ON(h->nsops != 1);
+	BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
+
+	/* Yes, there is a wait-for-zero semop. Restart */
+	if (h->sops[0].sem_op == 0)
+		return 1;
+
+	/* Again - no-one is waiting for the new value. */
+	return 0;
+}
+
 /**
  * update_queue(sma, semnum): Look for tasks that can be completed.
  * @sma: semaphore array.
  * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
  *
  * update_queue must be called after a semaphore in a semaphore array
  * was modified. If multiple semaphore were modified, then @semnum
  * must be set to -1.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function return 1 if at least one semop was completed successfully.
  */
-static void update_queue(struct sem_array *sma, int semnum)
+static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
 {
 	struct sem_queue *q;
 	struct list_head *walk;
 	struct list_head *pending_list;
 	int offset;
+	int semop_completed = 0;
 
 	/* if there are complex operations around, then knowing the semaphore
 	 * that was modified doesn't help us. Assume that multiple semaphores
@@ -469,7 +565,7 @@ static void update_queue(struct sem_array *sma, int semnum)
 again:
 	walk = pending_list->next;
 	while (walk != pending_list) {
-		int error, alter;
+		int error, restart;
 
 		q = (struct sem_queue *)((char *)walk - offset);
 		walk = walk->next;
@@ -494,22 +590,58 @@ again:
 
 		unlink_queue(sma, q);
 
-		/*
-		 * The next operation that must be checked depends on the type
-		 * of the completed operation:
-		 * - if the operation modified the array, then restart from the
-		 *   head of the queue and check for threads that might be
-		 *   waiting for the new semaphore values.
-		 * - if the operation didn't modify the array, then just
-		 *   continue.
-		 */
-		alter = q->alter;
-		wake_up_sem_queue(q, error);
-		if (alter && !error)
+		if (error) {
+			restart = 0;
+		} else {
+			semop_completed = 1;
+			restart = check_restart(sma, q);
+		}
+
+		wake_up_sem_queue_prepare(pt, q, error);
+		if (restart)
 			goto again;
 	}
+	return semop_completed;
+}
+
+/**
+ * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @otime: force setting otime
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_update() does the required called to update_queue, based on the
+ * actual changes that were performed on the semaphore array.
+ * Note that the function does not do the actual wake-up: the caller is
+ * responsible for calling wake_up_sem_queue_do(@pt).
+ * It is safe to perform this call after dropping all locks.
+ */
+static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
+			int otime, struct list_head *pt)
+{
+	int i;
+
+	if (sma->complex_count || sops == NULL) {
+		if (update_queue(sma, -1, pt))
+			otime = 1;
+		goto done;
+	}
+
+	for (i = 0; i < nsops; i++) {
+		if (sops[i].sem_op > 0 ||
+			(sops[i].sem_op < 0 &&
+				sma->sem_base[sops[i].sem_num].semval == 0))
+			if (update_queue(sma, sops[i].sem_num, pt))
+				otime = 1;
+	}
+done:
+	if (otime)
+		sma->sem_otime = get_seconds();
+}
+
 
 /* The following counts are associated to each semaphore:
  *   semncnt        number of tasks waiting on semval being nonzero
 *   semzcnt        number of tasks waiting on semval being zero
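do_smart_update() only rescans semaphores whose sleeping waiters could actually have become runnable: increments can satisfy sleeping decrements, and a decrement that reached zero can satisfy wait-for-zero waiters. A standalone restatement of that filter (hypothetical helper and struct names, not part of the patch):

	#include <stdbool.h>

	/* loosely mirrors struct sembuf */
	struct op_desc { unsigned short sem_num; short sem_op; };

	/* After applying 'op', where 'new_val' is the semaphore's resulting
	 * value, can a sleeping operation on that semaphore newly succeed?
	 *  - an increment can satisfy a sleeping decrement;
	 *  - a decrement that reached zero can satisfy wait-for-zero waiters;
	 *  - a decrement that left the value non-zero helps no-one. */
	static bool may_wake_waiters(struct op_desc op, int new_val)
	{
		if (op.sem_op > 0)
			return true;
		return op.sem_op < 0 && new_val == 0;
	}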
@@ -572,6 +704,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 	struct sem_undo *un, *tu;
 	struct sem_queue *q, *tq;
 	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
+	struct list_head tasks;
 
 	/* Free the existing undo structures for this semaphore set.  */
 	assert_spin_locked(&sma->sem_perm.lock);
@@ -585,15 +718,17 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 	}
 
 	/* Wake up all pending processes and let them fail with EIDRM. */
+	INIT_LIST_HEAD(&tasks);
 	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
 		unlink_queue(sma, q);
-		wake_up_sem_queue(q, -EIDRM);
+		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
 	}
 
 	/* Remove the semaphore set from the IDR */
 	sem_rmid(ns, sma);
 	sem_unlock(sma);
+	wake_up_sem_queue_do(&tasks);
 
 	ns->used_sems -= sma->sem_nsems;
 	security_sem_free(sma);
 	ipc_rcu_putref(sma);
@@ -608,6 +743,8 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in,
 	{
 		struct semid_ds out;
 
+		memset(&out, 0, sizeof(out));
+
 		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
 
 		out.sem_otime	= in->sem_otime;
@@ -715,11 +852,13 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 	ushort fast_sem_io[SEMMSL_FAST];
 	ushort* sem_io = fast_sem_io;
 	int nsems;
+	struct list_head tasks;
 
 	sma = sem_lock_check(ns, semid);
 	if (IS_ERR(sma))
 		return PTR_ERR(sma);
 
+	INIT_LIST_HEAD(&tasks);
 	nsems = sma->sem_nsems;
 
 	err = -EACCES;
@@ -807,7 +946,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		}
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma, -1);
+		do_smart_update(sma, NULL, 0, 0, &tasks);
 		err = 0;
 		goto out_unlock;
 	}
@@ -849,13 +988,15 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		curr->sempid = task_tgid_vnr(current);
 		sma->sem_ctime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma, semnum);
+		do_smart_update(sma, NULL, 0, 0, &tasks);
 		err = 0;
 		goto out_unlock;
 	}
 	}
 out_unlock:
 	sem_unlock(sma);
+	wake_up_sem_queue_do(&tasks);
+
 out_free:
 	if(sem_io != fast_sem_io)
 		ipc_free(sem_io, sizeof(ushort)*nsems);
@@ -1069,7 +1210,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	/* step 1: figure out the size of the semaphore array */
 	sma = sem_lock_check(ns, semid);
 	if (IS_ERR(sma))
-		return ERR_PTR(PTR_ERR(sma));
+		return ERR_CAST(sma);
 
 	nsems = sma->sem_nsems;
 	sem_getref_and_unlock(sma);
@@ -1117,6 +1258,33 @@ out:
 	return un;
 }
 
+
+/**
+ * get_queue_result - Retrieve the result code from sem_queue
+ * @q: Pointer to queue structure
+ *
+ * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
+ * q->status, then we must loop until the value is replaced with the final
+ * value: This may happen if a task is woken up by an unrelated event (e.g.
+ * signal) and in parallel the task is woken up by another task because it got
+ * the requested semaphores.
+ *
+ * The function can be called with or without holding the semaphore spinlock.
+ */
+static int get_queue_result(struct sem_queue *q)
+{
+	int error;
+
+	error = q->status;
+	while (unlikely(error == IN_WAKEUP)) {
+		cpu_relax();
+		error = q->status;
+	}
+
+	return error;
+}
+
+
 SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		unsigned, nsops, const struct timespec __user *, timeout)
 {
@@ -1129,6 +1297,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	struct sem_queue queue;
 	unsigned long jiffies_left = 0;
 	struct ipc_namespace *ns;
+	struct list_head tasks;
 
 	ns = current->nsproxy->ipc_ns;
 
@@ -1177,6 +1346,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	} else
 		un = NULL;
 
+	INIT_LIST_HEAD(&tasks);
+
 	sma = sem_lock_check(ns, semid);
 	if (IS_ERR(sma)) {
 		if (un)
@@ -1225,7 +1396,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
 	if (error <= 0) {
 		if (alter && error == 0)
-			update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1);
+			do_smart_update(sma, sops, nsops, 1, &tasks);
 
 		goto out_unlock_free;
 	}
@@ -1267,15 +1438,18 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	else
 		schedule();
 
-	error = queue.status;
-	while(unlikely(error == IN_WAKEUP)) {
-		cpu_relax();
-		error = queue.status;
-	}
+	error = get_queue_result(&queue);
 
 	if (error != -EINTR) {
 		/* fast path: update_queue already obtained all requested
-		 * resources */
+		 * resources.
+		 * Perform a smp_mb(): User space could assume that semop()
+		 * is a memory barrier: Without the mb(), the cpu could
+		 * speculatively read in user space stale data that was
+		 * overwritten by the previous owner of the semaphore.
+		 */
+		smp_mb();
+
 		goto out_free;
 	}
 
@@ -1285,10 +1459,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 		goto out_free;
 	}
 
+	error = get_queue_result(&queue);
+
 	/*
 	 * If queue.status != -EINTR we are woken up by another process
 	 */
-	error = queue.status;
+
 	if (error != -EINTR) {
 		goto out_unlock_free;
 	}
@@ -1302,6 +1478,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
 out_unlock_free:
 	sem_unlock(sma);
+
+	wake_up_sem_queue_do(&tasks);
 out_free:
 	if(sops != fast_sops)
 		kfree(sops);
@@ -1362,6 +1540,7 @@ void exit_sem(struct task_struct *tsk)
 	for (;;) {
 		struct sem_array *sma;
 		struct sem_undo *un;
+		struct list_head tasks;
 		int semid;
 		int i;
 
@@ -1425,10 +1604,11 @@ void exit_sem(struct task_struct *tsk)
 				semaphore->sempid = task_tgid_vnr(current);
 			}
 		}
-		sma->sem_otime = get_seconds();
 		/* maybe some queued-up processes were waiting for this */
-		update_queue(sma, -1);
+		INIT_LIST_HEAD(&tasks);
+		do_smart_update(sma, NULL, 0, 1, &tasks);
 		sem_unlock(sma);
+		wake_up_sem_queue_do(&tasks);
 
 		call_rcu(&un->rcu, free_un);
 	}
diff --git a/ipc/shm.c b/ipc/shm.c
index 23256b855819..52ed77eb9713 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -273,16 +273,13 @@ static int shm_release(struct inode *ino, struct file *file)
 	return 0;
 }
 
-static int shm_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int shm_fsync(struct file *file, int datasync)
 {
-	int (*fsync) (struct file *, struct dentry *, int datasync);
 	struct shm_file_data *sfd = shm_file_data(file);
-	int ret = -EINVAL;
 
-	fsync = sfd->file->f_op->fsync;
-	if (fsync)
-		ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync);
-	return ret;
+	if (!sfd->file->f_op->fsync)
+		return -EINVAL;
+	return sfd->file->f_op->fsync(sfd->file, datasync);
 }
 
 static unsigned long shm_get_unmapped_area(struct file *file,
@@ -764,8 +761,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
 		    if (euid != shp->shm_perm.uid &&
 			euid != shp->shm_perm.cuid)
 			goto out_unlock;
-		    if (cmd == SHM_LOCK &&
-			!current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
+		    if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
 			goto out_unlock;
 	}
diff --git a/ipc/syscall.c b/ipc/syscall.c
new file mode 100644
index 000000000000..1d6f53f6b562
--- /dev/null
+++ b/ipc/syscall.c
@@ -0,0 +1,99 @@
+/*
+ * sys_ipc() is the old de-multiplexer for the SysV IPC calls.
+ *
+ * This is really horribly ugly, and new architectures should just wire up
+ * the individual syscalls instead.
+ */
+#include <linux/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_IPC
+#include <linux/errno.h>
+#include <linux/ipc.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
+		unsigned long, third, void __user *, ptr, long, fifth)
+{
+	int version, ret;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	switch (call) {
+	case SEMOP:
+		return sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      second, NULL);
+	case SEMTIMEDOP:
+		return sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      second,
+				      (const struct timespec __user *)fifth);
+
+	case SEMGET:
+		return sys_semget(first, second, third);
+	case SEMCTL: {
+		union semun fourth;
+		if (!ptr)
+			return -EINVAL;
+		if (get_user(fourth.__pad, (void __user * __user *) ptr))
+			return -EFAULT;
+		return sys_semctl(first, second, third, fourth);
+	}
+
+	case MSGSND:
+		return sys_msgsnd(first, (struct msgbuf __user *) ptr,
+				  second, third);
+	case MSGRCV:
+		switch (version) {
+		case 0: {
+			struct ipc_kludge tmp;
+			if (!ptr)
+				return -EINVAL;
+
+			if (copy_from_user(&tmp,
+					   (struct ipc_kludge __user *) ptr,
+					   sizeof(tmp)))
+				return -EFAULT;
+			return sys_msgrcv(first, tmp.msgp, second,
+					  tmp.msgtyp, third);
+		}
+		default:
+			return sys_msgrcv(first,
+					  (struct msgbuf __user *) ptr,
+					  second, fifth, third);
+		}
+	case MSGGET:
+		return sys_msgget((key_t) first, second);
+	case MSGCTL:
+		return sys_msgctl(first, second, (struct msqid_ds __user *)ptr);
+
+	case SHMAT:
+		switch (version) {
+		default: {
+			unsigned long raddr;
+			ret = do_shmat(first, (char __user *)ptr,
+				       second, &raddr);
+			if (ret)
+				return ret;
+			return put_user(raddr, (unsigned long __user *) third);
+		}
+		case 1:
+			/*
+			 * This was the entry point for kernel-originating calls
+			 * from iBCS2 in 2.2 days.
+			 */
+			return -EINVAL;
+		}
+	case SHMDT:
+		return sys_shmdt((char __user *)ptr);
+	case SHMGET:
+		return sys_shmget(first, second, third);
+	case SHMCTL:
+		return sys_shmctl(first, second,
+				  (struct shmid_ds __user *) ptr);
+	default:
+		return -ENOSYS;
+	}
+}
+#endif
diff --git a/ipc/util.c b/ipc/util.c
index 79ce84e890f7..69a0cc13d966 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -124,8 +124,8 @@ void ipc_init_ids(struct ipc_ids *ids)
 	ids->seq = 0;
 	{
 		int seq_limit = INT_MAX/SEQ_MULTIPLIER;
-		if (seq_limit > USHORT_MAX)
-			ids->seq_max = USHORT_MAX;
+		if (seq_limit > USHRT_MAX)
+			ids->seq_max = USHRT_MAX;
 		else
 			ids->seq_max = seq_limit;
 	}
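For the new ipc/syscall.c above: on architectures that keep the legacy entry point (__ARCH_WANT_SYS_IPC, e.g. 32-bit x86), userspace still reaches sys_semtimedop() through the multiplexer. A hedged sketch of both routes (the SEMOP constant is copied from include/linux/ipc.h; the set, the operation and the trailing zero argument are illustrative):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#define SEMOP 1	/* call number from include/linux/ipc.h */

	int main(void)
	{
	#ifdef __NR_ipc
		int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
		struct sembuf op = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };

		/* Old route: sys_ipc() demultiplexes and forwards to
		 * sys_semtimedop(first, ptr, second, NULL), per ipc/syscall.c. */
		if (syscall(__NR_ipc, SEMOP, id, 1 /* nsops */, 0, &op, 0) < 0)
			perror("ipc(SEMOP)");

		/* New route: the direct syscall that new architectures wire up
		 * instead (glibc picks whichever the architecture provides). */
		if (semop(id, &op, 1) < 0)
			perror("semop");

		semctl(id, 0, IPC_RMID);
	#endif
		return 0;
	}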