Task Control Groups: basic task cgroup framework

Generic Process Control Groups -------------------------- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate. The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel This patch: Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks. Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Menage <menage@google.com> 2007-10-18 23:39:30 -0700
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 11:53:36 -0700
commit: ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree: 0881a031e669582f819d572339e955b04abfc3d2 /include/linux/cgroup.h
parent: 55a230aae650157720becc09cadb7d10efbf5013 (diff)
1 files changed, 214 insertions, 0 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
new file mode 100644
index 000000000000..60735dcf427a
--- /dev/null
+++ b/include/linux/cgroup.h
@@ -0,0 +1,214 @@
+#ifndef _LINUX_CGROUP_H
+#define _LINUX_CGROUP_H
+/*
+ *  cgroup interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/cpumask.h>
+#include <linux/nodemask.h>
+#include <linux/rcupdate.h>
+
+#ifdef CONFIG_CGROUPS
+
+struct cgroupfs_root;
+struct cgroup_subsys;
+struct inode;
+
+extern int cgroup_init_early(void);
+extern int cgroup_init(void);
+extern void cgroup_init_smp(void);
+extern void cgroup_lock(void);
+extern void cgroup_unlock(void);
+
+/* Per-subsystem/per-cgroup state maintained by the system. */
+struct cgroup_subsys_state {
+	/* The cgroup that this subsystem is attached to. Useful
+	 * for subsystems that want to know about the cgroup
+	 * hierarchy structure */
+	struct cgroup *cgroup;
+
+	/* State maintained by the cgroup system to allow
+	 * subsystems to be "busy". Should be accessed via css_get()
+	 * and css_put() */
+
+	atomic_t refcnt;
+
+	unsigned long flags;
+};
+
+/* bits in struct cgroup_subsys_state flags field */
+enum {
+	CSS_ROOT, /* This CSS is the root of the subsystem */
+};
+
+/*
+ * Call css_get() to hold a reference on the cgroup;
+ *
+ */
+
+static inline void css_get(struct cgroup_subsys_state *css)
+{
+	/* We don't need to reference count the root state */
+	if (!test_bit(CSS_ROOT, &css->flags))
+		atomic_inc(&css->refcnt);
+}
+/*
+ * css_put() should be called to release a reference taken by
+ * css_get()
+ */
+
+static inline void css_put(struct cgroup_subsys_state *css)
+{
+	if (!test_bit(CSS_ROOT, &css->flags))
+		atomic_dec(&css->refcnt);
+}
+
+struct cgroup {
+	unsigned long flags;		/* "unsigned long" so bitops work */
+
+	/* count users of this cgroup. >0 means busy, but doesn't
+	 * necessarily indicate the number of tasks in the
+	 * cgroup */
+	atomic_t count;
+
+	/*
+	 * We link our 'sibling' struct into our parent's 'children'.
+	 * Our children link their 'sibling' into our 'children'.
+	 */
+	struct list_head sibling;	/* my parent's children */
+	struct list_head children;	/* my children */
+
+	struct cgroup *parent;	/* my parent */
+	struct dentry *dentry;	  	/* cgroup fs entry */
+
+	/* Private pointers for each registered subsystem */
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	struct cgroupfs_root *root;
+	struct cgroup *top_cgroup;
+};
+
+/* struct cftype:
+ *
+ * The files in the cgroup filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ *	- the cgroup to use in file->f_dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+#define MAX_CFTYPE_NAME 64
+struct cftype {
+	/* By convention, the name should begin with the name of the
+	 * subsystem, followed by a period */
+	char name[MAX_CFTYPE_NAME];
+	int private;
+	int (*open) (struct inode *inode, struct file *file);
+	ssize_t (*read) (struct cgroup *cont, struct cftype *cft,
+			 struct file *file,
+			 char __user *buf, size_t nbytes, loff_t *ppos);
+	/*
+	 * read_uint() is a shortcut for the common case of returning a
+	 * single integer. Use it in place of read()
+	 */
+	u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
+	ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
+			  struct file *file,
+			  const char __user *buf, size_t nbytes, loff_t *ppos);
+	int (*release) (struct inode *inode, struct file *file);
+};
+
+/* Add a new file to the given cgroup directory. Should only be
+ * called by subsystems from within a populate() method */
+int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
+		       const struct cftype *cft);
+
+/* Add a set of new files to the given cgroup directory. Should
+ * only be called by subsystems from within a populate() method */
+int cgroup_add_files(struct cgroup *cont,
+			struct cgroup_subsys *subsys,
+			const struct cftype cft[],
+			int count);
+
+int cgroup_is_removed(const struct cgroup *cont);
+
+int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
+
+/* Return true if the cgroup is a descendant of the current cgroup */
+int cgroup_is_descendant(const struct cgroup *cont);
+
+/* Control Group subsystem type. See Documentation/cgroups.txt for details */
+
+struct cgroup_subsys {
+	struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
+						  struct cgroup *cont);
+	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont);
+	int (*can_attach)(struct cgroup_subsys *ss,
+			  struct cgroup *cont, struct task_struct *tsk);
+	void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont,
+			struct cgroup *old_cont, struct task_struct *tsk);
+	void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
+	void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
+	int (*populate)(struct cgroup_subsys *ss,
+			struct cgroup *cont);
+	void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
+	int subsys_id;
+	int active;
+	int early_init;
+#define MAX_CGROUP_TYPE_NAMELEN 32
+	const char *name;
+
+	/* Protected by RCU */
+	struct cgroupfs_root *root;
+
+	struct list_head sibling;
+
+	void *private;
+};
+
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+static inline struct cgroup_subsys_state *cgroup_subsys_state(
+	struct cgroup *cont, int subsys_id)
+{
+	return cont->subsys[subsys_id];
+}
+
+static inline struct cgroup_subsys_state *task_subsys_state(
+	struct task_struct *task, int subsys_id)
+{
+	return rcu_dereference(task->cgroups.subsys[subsys_id]);
+}
+
+static inline struct cgroup* task_cgroup(struct task_struct *task,
+					       int subsys_id)
+{
+	return task_subsys_state(task, subsys_id)->cgroup;
+}
+
+int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
+
+#else /* !CONFIG_CGROUPS */
+
+static inline int cgroup_init_early(void) { return 0; }
+static inline int cgroup_init(void) { return 0; }
+static inline void cgroup_init_smp(void) {}
+
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
+
+#endif /* !CONFIG_CGROUPS */
+
+#endif /* _LINUX_CGROUP_H */
author	Paul Menage <menage@google.com>	2007-10-18 23:39:30 -0700
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 11:53:36 -0700
commit	ddbcc7e8e50aefe467c01cac3dec71f118cd8ac2 (patch)
tree	0881a031e669582f819d572339e955b04abfc3d2 /include/linux/cgroup.h
parent	55a230aae650157720becc09cadb7d10efbf5013 (diff)