Merge branch 'for-3.15-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.16

Pull to receive e37a06f10994 ("cgroup: fix the retry path of cgroup_mount()") to avoid unnecessary conflicts with planned cgroup_tree_mutex removal and also to be able to remove the temp fix added by 36c38fb7144a ("blkcg: use trylock on blkcg_pol_mutex in blkcg_reset_stats()") afterwards. Signed-off-by: Tejun Heo <tj@kernel.org>
author: Tejun Heo <tj@kernel.org> 2014-05-13 11:30:04 -0400
committer: Tejun Heo <tj@kernel.org> 2014-05-13 11:30:04 -0400
commit: f21a4f7594a122dcaabc08ce03bfb63fdc34de1b (patch)
tree: bdce189dc55f875e5466c31374c52b48f4ba4db0
parent: d39ea871c3c1269e2852ea096fff492ea034df8a (diff)
parent: 36e9d2ebcc15d029b33f42a36146ab5a5bcfcfe7 (diff)
5 files changed, 238 insertions, 112 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e4a4145926f6..1039fb9ff5f5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -451,7 +451,20 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	struct blkcg_gq *blkg;
 	int i;
 
-	mutex_lock(&blkcg_pol_mutex);
+	/*
+	 * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
+	 * which ends up putting cgroup's internal cgroup_tree_mutex under
+	 * it; however, cgroup_tree_mutex is nested above cgroup file
+	 * active protection and grabbing blkcg_pol_mutex from a cgroup
+	 * file operation creates a possible circular dependency.  cgroup
+	 * internal locking is planned to go through further simplification
+	 * and this issue should go away soon.  For now, let's trylock
+	 * blkcg_pol_mutex and restart the write on failure.
+	 *
+	 * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
+	 */
+	if (!mutex_trylock(&blkcg_pol_mutex))
+		return restart_syscall();
 	spin_lock_irq(&blkcg->lock);
 
 	/*
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6ab3ee5d4a14..bde44618d8c2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -521,6 +521,7 @@ struct cftype {
 };
 
 extern struct cgroup_root cgrp_dfl_root;
+extern struct css_set init_css_set;
 
 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
@@ -751,6 +752,20 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 	return task_css_check(task, subsys_id, false);
 }
 
+/**
+ * task_css_is_root - test whether a task belongs to the root css
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Test whether @task belongs to the root css on the specified subsystem.
+ * May be invoked in any context.
+ */
+static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
+{
+	return task_css_check(task, subsys_id, true) ==
+		init_css_set.subsys[subsys_id];
+}
+
 static inline struct cgroup *task_cgroup(struct task_struct *task,
 					 int subsys_id)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 07815ef7b1f6..9db1a9629a5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -439,7 +439,7 @@ struct cgrp_cset_link {
  * reference-counted, to improve performance when child cgroups
  * haven't been created.
  */
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..345628c78b5b 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
+#include <linux/mutex.h>
 
 /*
  * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
 struct freezer {
 	struct cgroup_subsys_state	css;
 	unsigned int			state;
-	spinlock_t			lock;
 };
 
+static DEFINE_MUTEX(freezer_mutex);
+
 static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!freezer)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock_init(&freezer->lock);
 	return &freezer->css;
 }
 
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
 	struct freezer *freezer = css_freezer(css);
 	struct freezer *parent = parent_freezer(freezer);
 
-	/*
-	 * The following double locking and freezing state inheritance
-	 * guarantee that @cgroup can never escape ancestors' freezing
-	 * states.  See css_for_each_descendant_pre() for details.
-	 */
-	if (parent)
-		spin_lock_irq(&parent->lock);
-	spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+	mutex_lock(&freezer_mutex);
 
 	freezer->state |= CGROUP_FREEZER_ONLINE;
 
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
 		atomic_inc(&system_freezing_cnt);
 	}
 
-	spin_unlock(&freezer->lock);
-	if (parent)
-		spin_unlock_irq(&parent->lock);
-
+	mutex_unlock(&freezer_mutex);
 	return 0;
 }
 
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
 {
 	struct freezer *freezer = css_freezer(css);
 
-	spin_lock_irq(&freezer->lock);
+	mutex_lock(&freezer_mutex);
 
 	if (freezer->state & CGROUP_FREEZING)
 		atomic_dec(&system_freezing_cnt);
 
 	freezer->state = 0;
 
-	spin_unlock_irq(&freezer->lock);
+	mutex_unlock(&freezer_mutex);
 }
 
 static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
 	struct task_struct *task;
 	bool clear_frozen = false;
 
-	spin_lock_irq(&freezer->lock);
+	mutex_lock(&freezer_mutex);
 
 	/*
 	 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
 		}
 	}
 
-	spin_unlock_irq(&freezer->lock);
-
-	/*
-	 * Propagate FROZEN clearing upwards.  We may race with
-	 * update_if_frozen(), but as long as both work bottom-up, either
-	 * update_if_frozen() sees child's FROZEN cleared or we clear the
-	 * parent's FROZEN later.  No parent w/ !FROZEN children can be
-	 * left FROZEN.
-	 */
+	/* propagate FROZEN clearing upwards */
 	while (clear_frozen && (freezer = parent_freezer(freezer))) {
-		spin_lock_irq(&freezer->lock);
 		freezer->state &= ~CGROUP_FROZEN;
 		clear_frozen = freezer->state & CGROUP_FREEZING;
-		spin_unlock_irq(&freezer->lock);
 	}
+
+	mutex_unlock(&freezer_mutex);
 }
 
 /**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
 {
 	struct freezer *freezer;
 
-	rcu_read_lock();
-	freezer = task_freezer(task);
-
 	/*
 	 * The root cgroup is non-freezable, so we can skip locking the
 	 * freezer.  This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
 	 * to do.  If we lost and root is the new cgroup, noop is still the
 	 * right thing to do.
 	 */
-	if (!parent_freezer(freezer))
-		goto out;
+	if (task_css_is_root(task, freezer_cgrp_id))
+		return;
 
-	/*
-	 * Grab @freezer->lock and freeze @task after verifying @task still
-	 * belongs to @freezer and it's freezing.  The former is for the
-	 * case where we have raced against task migration and lost and
-	 * @task is already in a different cgroup which may not be frozen.
-	 * This isn't strictly necessary as freeze_task() is allowed to be
-	 * called spuriously but let's do it anyway for, if nothing else,
-	 * documentation.
-	 */
-	spin_lock_irq(&freezer->lock);
-	if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
+	mutex_lock(&freezer_mutex);
+	rcu_read_lock();
+
+	freezer = task_freezer(task);
+	if (freezer->state & CGROUP_FREEZING)
 		freeze_task(task);
-	spin_unlock_irq(&freezer->lock);
-out:
+
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 }
 
 /**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
 	struct css_task_iter it;
 	struct task_struct *task;
 
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	spin_lock_irq(&freezer->lock);
+	lockdep_assert_held(&freezer_mutex);
 
 	if (!(freezer->state & CGROUP_FREEZING) ||
 	    (freezer->state & CGROUP_FROZEN))
-		goto out_unlock;
+		return;
 
 	/* are all (live) children frozen? */
+	rcu_read_lock();
 	css_for_each_child(pos, css) {
 		struct freezer *child = css_freezer(pos);
 
 		if ((child->state & CGROUP_FREEZER_ONLINE) &&
-		    !(child->state & CGROUP_FROZEN))
-			goto out_unlock;
+		    !(child->state & CGROUP_FROZEN)) {
+			rcu_read_unlock();
+			return;
+		}
 	}
+	rcu_read_unlock();
 
 	/* are all tasks frozen? */
 	css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
 	freezer->state |= CGROUP_FROZEN;
 out_iter_end:
 	css_task_iter_end(&it);
-out_unlock:
-	spin_unlock_irq(&freezer->lock);
 }
 
 static int freezer_read(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(m), *pos;
 
+	mutex_lock(&freezer_mutex);
 	rcu_read_lock();
 
 	/* update states bottom-up */
-	css_for_each_descendant_post(pos, css)
+	css_for_each_descendant_post(pos, css) {
+		if (!css_tryget(pos))
+			continue;
+		rcu_read_unlock();
+
 		update_if_frozen(pos);
 
+		rcu_read_lock();
+		css_put(pos);
+	}
+
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 
 	seq_puts(m, freezer_state_strs(css_freezer(css)->state));
 	seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
 				unsigned int state)
 {
 	/* also synchronizes against task migration, see freezer_attach() */
-	lockdep_assert_held(&freezer->lock);
+	lockdep_assert_held(&freezer_mutex);
 
 	if (!(freezer->state & CGROUP_FREEZER_ONLINE))
 		return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
 	 * descendant will try to inherit its parent's FREEZING state as
 	 * CGROUP_FREEZING_PARENT.
 	 */
+	mutex_lock(&freezer_mutex);
 	rcu_read_lock();
 	css_for_each_descendant_pre(pos, &freezer->css) {
 		struct freezer *pos_f = css_freezer(pos);
 		struct freezer *parent = parent_freezer(pos_f);
 
-		spin_lock_irq(&pos_f->lock);
+		if (!css_tryget(pos))
+			continue;
+		rcu_read_unlock();
 
-		if (pos_f == freezer) {
+		if (pos_f == freezer)
 			freezer_apply_state(pos_f, freeze,
 					    CGROUP_FREEZING_SELF);
-		} else {
-			/*
-			 * Our update to @parent->state is already visible
-			 * which is all we need.  No need to lock @parent.
-			 * For more info on synchronization, see
-			 * freezer_post_create().
-			 */
+		else
 			freezer_apply_state(pos_f,
 					    parent->state & CGROUP_FREEZING,
 					    CGROUP_FREEZING_PARENT);
-		}
 
-		spin_unlock_irq(&pos_f->lock);
+		rcu_read_lock();
+		css_put(pos);
 	}
 	rcu_read_unlock();
+	mutex_unlock(&freezer_mutex);
 }
 
 static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 8365909f5f8c..9134dbf70d3e 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -306,57 +306,138 @@ static int devcgroup_seq_show(struct seq_file *m, void *v)
 }
 
 /**
- * may_access - verifies if a new exception is part of what is allowed
- *		by a dev cgroup based on the default policy +
- *		exceptions. This is used to make sure a child cgroup
- *		won't have more privileges than its parent or to
- *		verify if a certain access is allowed.
- * @dev_cgroup: dev cgroup to be tested against
- * @refex: new exception
- * @behavior: behavior of the exception
+ * match_exception	- iterates the exception list trying to find a complete match
+ * @exceptions: list of exceptions
+ * @type: device type (DEV_BLOCK or DEV_CHAR)
+ * @major: device file major number, ~0 to match all
+ * @minor: device file minor number, ~0 to match all
+ * @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD)
+ *
+ * It is considered a complete match if an exception is found that will
+ * contain the entire range of provided parameters.
+ *
+ * Return: true in case it matches an exception completely
  */
-static bool may_access(struct dev_cgroup *dev_cgroup,
-		       struct dev_exception_item *refex,
-		       enum devcg_behavior behavior)
+static bool match_exception(struct list_head *exceptions, short type,
+			    u32 major, u32 minor, short access)
 {
 	struct dev_exception_item *ex;
-	bool match = false;
 
-	rcu_lockdep_assert(rcu_read_lock_held() ||
-			   lockdep_is_held(&devcgroup_mutex),
-			   "device_cgroup::may_access() called without proper synchronization");
+	list_for_each_entry_rcu(ex, exceptions, list) {
+		if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
+			continue;
+		if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
+			continue;
+		if (ex->major != ~0 && ex->major != major)
+			continue;
+		if (ex->minor != ~0 && ex->minor != minor)
+			continue;
+		/* provided access cannot have more than the exception rule */
+		if (access & (~ex->access))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+/**
+ * match_exception_partial - iterates the exception list trying to find a partial match
+ * @exceptions: list of exceptions
+ * @type: device type (DEV_BLOCK or DEV_CHAR)
+ * @major: device file major number, ~0 to match all
+ * @minor: device file minor number, ~0 to match all
+ * @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD)
+ *
+ * It is considered a partial match if an exception's range is found to
+ * contain *any* of the devices specified by provided parameters. This is
+ * used to make sure no extra access is being granted that is forbidden by
+ * any of the exception list.
+ *
+ * Return: true in case the provided range mat matches an exception completely
+ */
+static bool match_exception_partial(struct list_head *exceptions, short type,
+				    u32 major, u32 minor, short access)
+{
+	struct dev_exception_item *ex;
 
-	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
-		if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
+	list_for_each_entry_rcu(ex, exceptions, list) {
+		if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
 			continue;
-		if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
+		if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
 			continue;
-		if (ex->major != ~0 && ex->major != refex->major)
+		/*
+		 * We must be sure that both the exception and the provided
+		 * range aren't masking all devices
+		 */
+		if (ex->major != ~0 && major != ~0 && ex->major != major)
 			continue;
-		if (ex->minor != ~0 && ex->minor != refex->minor)
+		if (ex->minor != ~0 && minor != ~0 && ex->minor != minor)
 			continue;
-		if (refex->access & (~ex->access))
+		/*
+		 * In order to make sure the provided range isn't matching
+		 * an exception, all its access bits shouldn't match the
+		 * exception's access bits
+		 */
+		if (!(access & ex->access))
 			continue;
-		match = true;
-		break;
+		return true;
 	}
+	return false;
+}
+
+/**
+ * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
+ * @dev_cgroup: dev cgroup to be tested against
+ * @refex: new exception
+ * @behavior: behavior of the exception's dev_cgroup
+ *
+ * This is used to make sure a child cgroup won't have more privileges
+ * than its parent
+ */
+static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
+		          struct dev_exception_item *refex,
+		          enum devcg_behavior behavior)
+{
+	bool match = false;
+
+	rcu_lockdep_assert(rcu_read_lock_held() ||
+			   lockdep_is_held(&devcgroup_mutex),
+			   "device_cgroup:verify_new_ex called without proper synchronization");
 
 	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
 		if (behavior == DEVCG_DEFAULT_ALLOW) {
-			/* the exception will deny access to certain devices */
+			/*
+			 * new exception in the child doesn't matter, only
+			 * adding extra restrictions
+			 */ 
 			return true;
 		} else {
-			/* the exception will allow access to certain devices */
+			/*
+			 * new exception in the child will add more devices
+			 * that can be acessed, so it can't match any of
+			 * parent's exceptions, even slightly
+			 */ 
+			match = match_exception_partial(&dev_cgroup->exceptions,
+							refex->type,
+							refex->major,
+							refex->minor,
+							refex->access);
+
 			if (match)
-				/*
-				 * a new exception allowing access shouldn't
-				 * match an parent's exception
-				 */
 				return false;
 			return true;
 		}
 	} else {
-		/* only behavior == DEVCG_DEFAULT_DENY allowed here */
+		/*
+		 * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore
+		 * the new exception will add access to more devices and must
+		 * be contained completely in an parent's exception to be
+		 * allowed
+		 */
+		match = match_exception(&dev_cgroup->exceptions, refex->type,
+					refex->major, refex->minor,
+					refex->access);
+
 		if (match)
 			/* parent has an exception that matches the proposed */
 			return true;
@@ -378,7 +459,38 @@ static int parent_has_perm(struct dev_cgroup *childcg,
 
 	if (!parent)
 		return 1;
-	return may_access(parent, ex, childcg->behavior);
+	return verify_new_ex(parent, ex, childcg->behavior);
+}
+
+/**
+ * parent_allows_removal - verify if it's ok to remove an exception
+ * @childcg: child cgroup from where the exception will be removed
+ * @ex: exception being removed
+ *
+ * When removing an exception in cgroups with default ALLOW policy, it must
+ * be checked if removing it will give the child cgroup more access than the
+ * parent.
+ *
+ * Return: true if it's ok to remove exception, false otherwise
+ */
+static bool parent_allows_removal(struct dev_cgroup *childcg,
+				  struct dev_exception_item *ex)
+{
+	struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css));
+
+	if (!parent)
+		return true;
+
+	/* It's always allowed to remove access to devices */
+	if (childcg->behavior == DEVCG_DEFAULT_DENY)
+		return true;
+
+	/*
+	 * Make sure you're not removing part or a whole exception existing in
+	 * the parent cgroup
+	 */
+	return !match_exception_partial(&parent->exceptions, ex->type,
+					ex->major, ex->minor, ex->access);
 }
 
 /**
@@ -616,17 +728,21 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 
 	switch (filetype) {
 	case DEVCG_ALLOW:
-		if (!parent_has_perm(devcgroup, &ex))
-			return -EPERM;
 		/*
 		 * If the default policy is to allow by default, try to remove
 		 * an matching exception instead. And be silent about it: we
 		 * don't want to break compatibility
 		 */
 		if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
+			/* Check if the parent allows removing it first */
+			if (!parent_allows_removal(devcgroup, &ex))
+				return -EPERM;
 			dev_exception_rm(devcgroup, &ex);
-			return 0;
+			break;
 		}
+
+		if (!parent_has_perm(devcgroup, &ex))
+			return -EPERM;
 		rc = dev_exception_add(devcgroup, &ex);
 		break;
 	case DEVCG_DENY:
@@ -704,18 +820,18 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 				        short access)
 {
 	struct dev_cgroup *dev_cgroup;
-	struct dev_exception_item ex;
-	int rc;
-
-	memset(&ex, 0, sizeof(ex));
-	ex.type = type;
-	ex.major = major;
-	ex.minor = minor;
-	ex.access = access;
+	bool rc;
 
 	rcu_read_lock();
 	dev_cgroup = task_devcgroup(current);
-	rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior);
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW)
+		/* Can't match any of the exceptions, even partially */
+		rc = !match_exception_partial(&dev_cgroup->exceptions,
+					      type, major, minor, access);
+	else
+		/* Need to match completely one exception to be allowed */
+		rc = match_exception(&dev_cgroup->exceptions, type, major,
+				     minor, access);
 	rcu_read_unlock();
 
 	if (!rc)
author	Tejun Heo <tj@kernel.org>	2014-05-13 11:30:04 -0400
committer	Tejun Heo <tj@kernel.org>	2014-05-13 11:30:04 -0400
commit	f21a4f7594a122dcaabc08ce03bfb63fdc34de1b (patch)
tree	bdce189dc55f875e5466c31374c52b48f4ba4db0
parent	d39ea871c3c1269e2852ea096fff492ea034df8a (diff)
parent	36e9d2ebcc15d029b33f42a36146ab5a5bcfcfe7 (diff)