From 2b4c7afe79a8a0a0e05edeaded5653c190153f9b Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Sun, 15 May 2016 22:47:39 -0400
Subject: audit: fixup: log on errors from filter user rules

In commit 724e4fcc the intention was to pass any errors back from
audit_filter_user_rules() to audit_filter_user().  Add that code.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditfilter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 94ca7b1e5e7e..8a8aa3fbc8d8 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1339,8 +1339,8 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
 			break;
 		}
 
-		if (!result)
-			return 0;
+		if (result <= 0)
+			return result;
 	}
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-- 
cgit v1.2.3


From e788892ba3cc71d385b75895f7a375fbc659ce86 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 2 Jun 2016 23:24:15 +0200
Subject: cpufreq: governor: Get rid of governor events

The design of the cpufreq governor API is not very straightforward,
as struct cpufreq_governor provides only one callback to be invoked
from different code paths for different purposes.  The purpose it is
invoked for is determined by its second "event" argument, causing it
to act as a "callback multiplexer" of sorts.

Unfortunately, that leads to extra complexity in governors, some of
which implement the ->governor() callback as a switch statement
that simply checks the event argument and invokes a separate function
to handle that specific event.

That extra complexity can be eliminated by replacing the all-purpose
->governor() callback with a family of callbacks to carry out specific
governor operations: initialization and exit, start and stop and policy
limits updates.  That also turns out to reduce the code size too, so
do it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 kernel/sched/cpufreq_schedutil.c | 34 ++++++++--------------------------
 1 file changed, 8 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 14c4aa25cc45..fdcee3cf38fc 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -394,7 +394,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	return ret;
 }
 
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	struct sugov_tunables *tunables = sg_policy->tunables;
@@ -412,7 +412,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
-	return 0;
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
@@ -444,7 +443,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 	return 0;
 }
 
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	unsigned int cpu;
@@ -456,10 +455,9 @@ static int sugov_stop(struct cpufreq_policy *policy)
 
 	irq_work_sync(&sg_policy->irq_work);
 	cancel_work_sync(&sg_policy->work);
-	return 0;
 }
 
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 
@@ -477,32 +475,16 @@ static int sugov_limits(struct cpufreq_policy *policy)
 	}
 
 	sg_policy->need_freq_update = true;
-	return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
-	if (event == CPUFREQ_GOV_POLICY_INIT) {
-		return sugov_init(policy);
-	} else if (policy->governor_data) {
-		switch (event) {
-		case CPUFREQ_GOV_POLICY_EXIT:
-			return sugov_exit(policy);
-		case CPUFREQ_GOV_START:
-			return sugov_start(policy);
-		case CPUFREQ_GOV_STOP:
-			return sugov_stop(policy);
-		case CPUFREQ_GOV_LIMITS:
-			return sugov_limits(policy);
-		}
-	}
-	return -EINVAL;
 }
 
 static struct cpufreq_governor schedutil_gov = {
 	.name = "schedutil",
-	.governor = sugov_governor,
 	.owner = THIS_MODULE,
+	.init = sugov_init,
+	.exit = sugov_exit,
+	.start = sugov_start,
+	.stop = sugov_stop,
+	.limits = sugov_limits,
 };
 
 static int __init sugov_module_init(void)
-- 
cgit v1.2.3


From bf2be2de8493dd5f86d6e0f0d4eecb5810ad035b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 18 May 2016 17:55:31 +0530
Subject: cpufreq: governor: Create cpufreq_policy_apply_limits()

Create a new helper to avoid code duplication across governors.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fdcee3cf38fc..758efd7f3abe 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -463,14 +463,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
 
 	if (!policy->fast_switch_enabled) {
 		mutex_lock(&sg_policy->work_lock);
-
-		if (policy->max < policy->cur)
-			__cpufreq_driver_target(policy, policy->max,
-						CPUFREQ_RELATION_H);
-		else if (policy->min > policy->cur)
-			__cpufreq_driver_target(policy, policy->min,
-						CPUFREQ_RELATION_L);
-
+		cpufreq_policy_apply_limits(policy);
 		mutex_unlock(&sg_policy->work_lock);
 	}
 
-- 
cgit v1.2.3


From 98f368e9e2630a3ce3e80fb10fb2e02038cf9578 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Thu, 2 Jun 2016 23:43:21 -0500
Subject: kernel: Add noaudit variant of ns_capable()

When checking the current cred for a capability in a specific user
namespace, it isn't always desirable to have the LSMs audit the check.
This patch adds a noaudit variant of ns_capable() for when those
situations arise.

The common logic between ns_capable() and the new ns_capable_noaudit()
is moved into a single, shared function to keep duplicated code to a
minimum and ease maintainability.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 kernel/capability.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
 	return has_ns_capability_noaudit(t, &init_user_ns, cap);
 }
 
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+	int capable;
+
+	if (unlikely(!cap_valid(cap))) {
+		pr_crit("capable() called with invalid cap=%u\n", cap);
+		BUG();
+	}
+
+	capable = audit ? security_capable(current_cred(), ns, cap) :
+			  security_capable_noaudit(current_cred(), ns, cap);
+	if (capable == 0) {
+		current->flags |= PF_SUPERPRIV;
+		return true;
+	}
+	return false;
+}
+
 /**
  * ns_capable - Determine if the current task has a superior capability in effect
  * @ns:  The usernamespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
  */
 bool ns_capable(struct user_namespace *ns, int cap)
 {
-	if (unlikely(!cap_valid(cap))) {
-		pr_crit("capable() called with invalid cap=%u\n", cap);
-		BUG();
-	}
-
-	if (security_capable(current_cred(), ns, cap) == 0) {
-		current->flags |= PF_SUPERPRIV;
-		return true;
-	}
-	return false;
+	return ns_capable_common(ns, cap, true);
 }
 EXPORT_SYMBOL(ns_capable);
 
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns:  The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+	return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
 
 /**
  * capable - Determine if the current task has a superior capability in effect
-- 
cgit v1.2.3


From 4e49ea4a3d276365bf7396c9b77b4d1d5923835a Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:31:41 -0500
Subject: block/fs/drivers: remove rw argument from submit_bio

This has callers of submit_bio/submit_bio_wait set the bio->bi_rw
instead of passing it in. This makes that use the same as
generic_make_request and how we set the other bio fields.

Signed-off-by: Mike Christie <mchristi@redhat.com>

Fixed up fs/ext4/crypto.c

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/power/swap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..be227f5aa9dc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -271,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = hib_resume_bdev;
+	bio->bi_rw = rw;
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -283,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 		bio->bi_end_io = hib_end_io;
 		bio->bi_private = hb;
 		atomic_inc(&hb->count);
-		submit_bio(rw, bio);
+		submit_bio(bio);
 	} else {
-		error = submit_bio_wait(rw, bio);
+		error = submit_bio_wait(bio);
 		bio_put(bio);
 	}
 
-- 
cgit v1.2.3


From 162b99e3119767cb6478c55a5aed70469389df88 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:02 -0500
Subject: pm: use bio op accessors

Separate the op from the rq_flag_bits and have the pm code
set/get the bio using bio_set_op_attrs/bio_op.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/power/swap.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index be227f5aa9dc..c1aaac431055 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -261,7 +261,7 @@ static void hib_end_io(struct bio *bio)
 	bio_put(bio);
 }
 
-static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 		struct hib_bio_batch *hb)
 {
 	struct page *page = virt_to_page(addr);
@@ -271,7 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
 	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = hib_resume_bdev;
-	bio->bi_rw = rw;
+	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
@@ -307,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+		      swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -316,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
-					swsusp_header, NULL);
+		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
 		error = -ENODEV;
@@ -390,7 +391,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 	} else {
 		src = buf;
 	}
-	return hib_submit_io(WRITE_SYNC, offset, src, hb);
+	return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -993,7 +994,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
+		error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
+				      tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -1017,7 +1019,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_submit_io(READ_SYNC, offset, buf, hb);
+	error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1526,7 +1528,8 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_submit_io(READ_SYNC, swsusp_resume_block,
+		error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+					swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
 			goto put;
@@ -1534,7 +1537,8 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+			error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+						swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
 			error = -EINVAL;
@@ -1578,10 +1582,12 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+		      swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
+		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
-- 
cgit v1.2.3


From 1b9a9ab78b0ab79dc1f0ddd5fbed7833ec7de3a4 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:18 -0500
Subject: blktrace: use op accessors

Have blktrace use the req/bio op accessor to get the REQ_OP.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 62 ++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef8654e90d..2d16fad519b2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -199,7 +199,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
-		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+		     int op, int op_flags, u32 what, int error, int pdu_len,
+		     void *pdu_data)
 {
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
@@ -214,13 +215,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
 
-	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, SYNC);
-	what |= MASK_TC_BIT(rw, RAHEAD);
-	what |= MASK_TC_BIT(rw, META);
-	what |= MASK_TC_BIT(rw, DISCARD);
-	what |= MASK_TC_BIT(rw, FLUSH);
-	what |= MASK_TC_BIT(rw, FUA);
+	what |= ddir_act[op_is_write(op) ? WRITE : READ];
+	what |= MASK_TC_BIT(op_flags, SYNC);
+	what |= MASK_TC_BIT(op_flags, RAHEAD);
+	what |= MASK_TC_BIT(op_flags, META);
+	what |= MASK_TC_BIT(op_flags, FLUSH);
+	what |= MASK_TC_BIT(op_flags, FUA);
+	if (op == REQ_OP_DISCARD)
+		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -708,11 +710,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+		__blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
 				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+		__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
 				rq->cmd_flags, what, rq->errors, 0, NULL);
 	}
 }
@@ -770,7 +772,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio->bi_rw, what, error, 0, NULL);
+			bio_op(bio), bio->bi_rw, what, error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +820,8 @@ static void blk_add_trace_getrq(void *ignore,
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+					NULL);
 	}
 }
 
@@ -833,7 +836,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
 					0, 0, NULL);
 	}
 }
@@ -843,7 +846,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
 }
 
 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +863,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 		else
 			what = BLK_TA_UNPLUG_TIMER;
 
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
 	}
 }
 
@@ -874,8 +877,9 @@ static void blk_add_trace_split(void *ignore,
 		__be64 rpdu = cpu_to_be64(pdu);
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
-				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
-				bio->bi_error, sizeof(rpdu), &rpdu);
+				bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+				&rpdu);
 	}
 }
 
@@ -907,7 +911,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+			bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
 			sizeof(r), &r);
 }
 
@@ -940,7 +944,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-			rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+			rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
 			sizeof(r), &r);
 }
 
@@ -965,10 +969,10 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
-		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
 				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1773,27 @@ void blk_dump_cmd(char *buf, struct request *rq)
 	}
 }
 
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 {
 	int i = 0;
 
 	if (rw & REQ_FLUSH)
 		rwbs[i++] = 'F';
 
-	if (rw & WRITE)
+	switch (op) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_SAME:
 		rwbs[i++] = 'W';
-	else if (rw & REQ_DISCARD)
+		break;
+	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
-	else if (bytes)
+		break;
+	case REQ_OP_READ:
 		rwbs[i++] = 'R';
-	else
+		break;
+	default:
 		rwbs[i++] = 'N';
+	}
 
 	if (rw & REQ_FUA)
 		rwbs[i++] = 'F';
-- 
cgit v1.2.3


From 3a5e02ced11e22ecd9da3d6710afe15bcfee1d10 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:23 -0500
Subject: block, drivers: add REQ_OP_FLUSH operation

This adds a REQ_OP_FLUSH operation that is sent to request_fn
based drivers by the block layer's flush code, instead of
sending requests with the request->cmd_flags REQ_FLUSH bit set.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2d16fad519b2..0c70fbb6ea8d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,6 +223,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, FUA);
 	if (op == REQ_OP_DISCARD)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
+	if (op == REQ_OP_FLUSH)
+		what |= BLK_TC_ACT(BLK_TC_FLUSH);
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -1788,6 +1790,9 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
 		break;
+	case REQ_OP_FLUSH:
+		rwbs[i++] = 'F';
+		break;
 	case REQ_OP_READ:
 		rwbs[i++] = 'R';
 		break;
-- 
cgit v1.2.3


From 28a8f0d317bf225ff15008f5dd66ae16242dd843 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Sun, 5 Jun 2016 14:32:25 -0500
Subject: block, drivers, fs: rename REQ_FLUSH to REQ_PREFLUSH

To avoid confusion between REQ_OP_FLUSH, which is handled by
request_fn drivers, and upper layers requesting the block layer
perform a flush sequence along with possibly a WRITE, this patch
renames REQ_FLUSH to REQ_PREFLUSH.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 0c70fbb6ea8d..03b0dd98ff0e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -189,6 +189,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
 				 BLK_TC_ACT(BLK_TC_WRITE) };
 
 #define BLK_TC_RAHEAD		BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH		BLK_TC_FLUSH
 
 /* The ilog2() calls fall out because they're constant */
 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -219,7 +220,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, SYNC);
 	what |= MASK_TC_BIT(op_flags, RAHEAD);
 	what |= MASK_TC_BIT(op_flags, META);
-	what |= MASK_TC_BIT(op_flags, FLUSH);
+	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
 	if (op == REQ_OP_DISCARD)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
@@ -1779,7 +1780,7 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 {
 	int i = 0;
 
-	if (rw & REQ_FLUSH)
+	if (rw & REQ_PREFLUSH)
 		rwbs[i++] = 'F';
 
 	switch (op) {
-- 
cgit v1.2.3


From 288dab8a35a0bde426a09870943c8d3ee3a50dab Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Jun 2016 16:00:36 +0200
Subject: block: add a separate operation type for secure erase

Instead of overloading the discard support with the REQ_SECURE flag.
Use the opportunity to rename the queue flag as well, and remove the
dead checks for this flag in the RAID 1 and RAID 10 drivers that don't
claim support for secure erase.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 03b0dd98ff0e..af49caf973eb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1791,6 +1791,10 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 	case REQ_OP_DISCARD:
 		rwbs[i++] = 'D';
 		break;
+	case REQ_OP_SECURE_ERASE:
+		rwbs[i++] = 'D';
+		rwbs[i++] = 'E';
+		break;
 	case REQ_OP_FLUSH:
 		rwbs[i++] = 'F';
 		break;
@@ -1809,8 +1813,6 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 		rwbs[i++] = 'S';
 	if (rw & REQ_META)
 		rwbs[i++] = 'M';
-	if (rw & REQ_SECURE)
-		rwbs[i++] = 'E';
 
 	rwbs[i] = '\0';
 }
-- 
cgit v1.2.3


From 678309117768e25751594a48a2d873b0552a3130 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Thu, 2 Jun 2016 13:20:32 +0100
Subject: PM / Hibernate: Don't let kasan instrument snapshot.c

Kasan causes the compiler to instrument C code and is used at runtime to
detect accesses to memory that has been freed, or not yet allocated.

The code in snapshot.c saves and restores memory when hibernating. This will
access whole pages in the slab cache that have both free and allocated
areas, resulting in a large number of false positives from Kasan.

Disable instrumentation of this file.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index cb880a14cc39..eb4f717705ba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,6 +1,8 @@
 
 ccflags-$(CONFIG_PM_DEBUG)	:= -DDEBUG
 
+KASAN_SANITIZE_snapshot.o	:= n
+
 obj-y				+= qos.o
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
-- 
cgit v1.2.3


From 2f275de5d1ed7269913ef9b4c64a13952c0a38e8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 27 May 2016 12:57:02 -0700
Subject: seccomp: Add a seccomp_data parameter secure_computing()

Currently, if arch code wants to supply seccomp_data directly to
seccomp (which is generally much faster than having seccomp do it
using the syscall_get_xyz() API), it has to use the two-phase
seccomp hooks. Add it to the easy hooks, too.

Cc: linux-arch@vger.kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7002796f14a4..06816290a212 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -554,9 +554,9 @@ void secure_computing_strict(int this_syscall)
 		BUG();
 }
 #else
-int __secure_computing(void)
+int __secure_computing(const struct seccomp_data *sd)
 {
-	u32 phase1_result = seccomp_phase1(NULL);
+	u32 phase1_result = seccomp_phase1(sd);
 
 	if (likely(phase1_result == SECCOMP_PHASE1_OK))
 		return 0;
-- 
cgit v1.2.3


From 8112c4f140fa03f9ee68aad2cc79afa7df5418d3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 Jun 2016 16:02:17 -0700
Subject: seccomp: remove 2-phase API

Since nothing is using the 2-phase API, and it adds more complexity than
benefit, remove it.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
---
 kernel/seccomp.c | 129 ++++++++++++++++++-------------------------------------
 1 file changed, 41 insertions(+), 88 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 06816290a212..14a37d71b612 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  *
  * Returns valid seccomp BPF response codes.
  */
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd)
 {
 	struct seccomp_data sd_local;
 	u32 ret = SECCOMP_RET_ALLOW;
@@ -554,20 +554,9 @@ void secure_computing_strict(int this_syscall)
 		BUG();
 }
 #else
-int __secure_computing(const struct seccomp_data *sd)
-{
-	u32 phase1_result = seccomp_phase1(sd);
-
-	if (likely(phase1_result == SECCOMP_PHASE1_OK))
-		return 0;
-	else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
-		return -1;
-	else
-		return seccomp_phase2(phase1_result);
-}
 
 #ifdef CONFIG_SECCOMP_FILTER
-static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 {
 	u32 filter_ret, action;
 	int data;
@@ -599,10 +588,33 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
 		goto skip;
 
 	case SECCOMP_RET_TRACE:
-		return filter_ret;  /* Save the rest for phase 2. */
+		/* ENOSYS these calls if there is no tracer attached. */
+		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+			syscall_set_return_value(current,
+						 task_pt_regs(current),
+						 -ENOSYS, 0);
+			goto skip;
+		}
+
+		/* Allow the BPF to provide the event message */
+		ptrace_event(PTRACE_EVENT_SECCOMP, data);
+		/*
+		 * The delivery of a fatal signal during event
+		 * notification may silently skip tracer notification.
+		 * Terminating the task now avoids executing a system
+		 * call that may not be intended.
+		 */
+		if (fatal_signal_pending(current))
+			do_exit(SIGSYS);
+		/* Check if the tracer forced the syscall to be skipped. */
+		this_syscall = syscall_get_nr(current, task_pt_regs(current));
+		if (this_syscall < 0)
+			goto skip;
+
+		return 0;
 
 	case SECCOMP_RET_ALLOW:
-		return SECCOMP_PHASE1_OK;
+		return 0;
 
 	case SECCOMP_RET_KILL:
 	default:
@@ -614,96 +626,37 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
 
 skip:
 	audit_seccomp(this_syscall, 0, action);
-	return SECCOMP_PHASE1_SKIP;
+	return -1;
+}
+#else
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+{
+	BUG();
 }
 #endif
 
-/**
- * seccomp_phase1() - run fast path seccomp checks on the current syscall
- * @arg sd: The seccomp_data or NULL
- *
- * This only reads pt_regs via the syscall_xyz helpers.  The only change
- * it will make to pt_regs is via syscall_set_return_value, and it will
- * only do that if it returns SECCOMP_PHASE1_SKIP.
- *
- * If sd is provided, it will not read pt_regs at all.
- *
- * It may also call do_exit or force a signal; these actions must be
- * safe.
- *
- * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
- * be processed normally.
- *
- * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
- * invoked.  In this case, seccomp_phase1 will have set the return value
- * using syscall_set_return_value.
- *
- * If it returns anything else, then the return value should be passed
- * to seccomp_phase2 from a context in which ptrace hooks are safe.
- */
-u32 seccomp_phase1(struct seccomp_data *sd)
+int __secure_computing(const struct seccomp_data *sd)
 {
 	int mode = current->seccomp.mode;
-	int this_syscall = sd ? sd->nr :
-		syscall_get_nr(current, task_pt_regs(current));
+	int this_syscall;
 
 	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
-		return SECCOMP_PHASE1_OK;
+		return 0;
+
+	this_syscall = sd ? sd->nr :
+		syscall_get_nr(current, task_pt_regs(current));
 
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
-		return SECCOMP_PHASE1_OK;
-#ifdef CONFIG_SECCOMP_FILTER
+		return 0;
 	case SECCOMP_MODE_FILTER:
-		return __seccomp_phase1_filter(this_syscall, sd);
-#endif
+		return __seccomp_filter(this_syscall, sd);
 	default:
 		BUG();
 	}
 }
-
-/**
- * seccomp_phase2() - finish slow path seccomp work for the current syscall
- * @phase1_result: The return value from seccomp_phase1()
- *
- * This must be called from a context in which ptrace hooks can be used.
- *
- * Returns 0 if the syscall should be processed or -1 to skip the syscall.
- */
-int seccomp_phase2(u32 phase1_result)
-{
-	struct pt_regs *regs = task_pt_regs(current);
-	u32 action = phase1_result & SECCOMP_RET_ACTION;
-	int data = phase1_result & SECCOMP_RET_DATA;
-
-	BUG_ON(action != SECCOMP_RET_TRACE);
-
-	audit_seccomp(syscall_get_nr(current, regs), 0, action);
-
-	/* Skip these calls if there is no tracer. */
-	if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
-		syscall_set_return_value(current, regs,
-					 -ENOSYS, 0);
-		return -1;
-	}
-
-	/* Allow the BPF to provide the event message */
-	ptrace_event(PTRACE_EVENT_SECCOMP, data);
-	/*
-	 * The delivery of a fatal signal during event
-	 * notification may silently skip tracer notification.
-	 * Terminating the task now avoids executing a system
-	 * call that may not be intended.
-	 */
-	if (fatal_signal_pending(current))
-		do_exit(SIGSYS);
-	if (syscall_get_nr(current, regs) < 0)
-		return -1;  /* Explicit request to skip. */
-
-	return 0;
-}
 #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
 
 long prctl_get_seccomp(void)
-- 
cgit v1.2.3


From ce6526e8afa4b6ad0ab134a4cc50c9c863319637 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 Jun 2016 19:29:15 -0700
Subject: seccomp: recheck the syscall after RET_TRACE

When RET_TRACE triggers, a tracer may change a syscall into something that
should be filtered by seccomp. This re-runs seccomp after a trace event
to make sure things continue to pass.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
---
 kernel/seccomp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 14a37d71b612..54d15eb2b701 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -556,7 +556,8 @@ void secure_computing_strict(int this_syscall)
 #else
 
 #ifdef CONFIG_SECCOMP_FILTER
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+			    const bool recheck_after_trace)
 {
 	u32 filter_ret, action;
 	int data;
@@ -588,6 +589,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 		goto skip;
 
 	case SECCOMP_RET_TRACE:
+		/* We've been put in this state by the ptracer already. */
+		if (recheck_after_trace)
+			return 0;
+
 		/* ENOSYS these calls if there is no tracer attached. */
 		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
 			syscall_set_return_value(current,
@@ -611,6 +616,15 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
 		if (this_syscall < 0)
 			goto skip;
 
+		/*
+		 * Recheck the syscall, since it may have changed. This
+		 * intentionally uses a NULL struct seccomp_data to force
+		 * a reload of all registers. This does not goto skip since
+		 * a skip would have already been reported.
+		 */
+		if (__seccomp_filter(this_syscall, NULL, true))
+			return -1;
+
 		return 0;
 
 	case SECCOMP_RET_ALLOW:
@@ -629,7 +643,8 @@ skip:
 	return -1;
 }
 #else
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+			    const bool recheck_after_trace)
 {
 	BUG();
 }
@@ -652,7 +667,7 @@ int __secure_computing(const struct seccomp_data *sd)
 		__secure_computing_strict(this_syscall);  /* may call do_exit */
 		return 0;
 	case SECCOMP_MODE_FILTER:
-		return __seccomp_filter(this_syscall, sd);
+		return __seccomp_filter(this_syscall, sd, false);
 	default:
 		BUG();
 	}
-- 
cgit v1.2.3


From ca5f2b4c4fb7bb7397317ee2ead83485aa295a3e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 14 Jun 2016 16:23:22 +0200
Subject: PM / sleep: Make pm_prepare_console() return void

Nothing is using its return value so change it to return void.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/console.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..0e781798b0b3 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -126,17 +126,17 @@ out:
 	return ret;
 }
 
-int pm_prepare_console(void)
+void pm_prepare_console(void)
 {
 	if (!pm_vt_switch())
-		return 0;
+		return;
 
 	orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
 	if (orig_fgconsole < 0)
-		return 1;
+		return;
 
 	orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
-	return 0;
+	return;
 }
 
 void pm_restore_console(void)
-- 
cgit v1.2.3


From 61d1b6a42fec61c5065f54cc62cef02b483c69fb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:12 +0200
Subject: bpf, maps: add release callback

Add a release callback for maps that is invoked when the last
reference to its struct file is gone and the struct file about
to be released by vfs. The handler will be used by fd array maps.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 46ecce4b79ed..fc3adcd064b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map)
 
 static int bpf_map_release(struct inode *inode, struct file *filp)
 {
-	bpf_map_put_with_uref(filp->private_data);
+	struct bpf_map *map = filp->private_data;
+
+	if (map->ops->map_release)
+		map->ops->map_release(map, filp);
+
+	bpf_map_put_with_uref(map);
 	return 0;
 }
 
-- 
cgit v1.2.3


From d056a788765e67773124f520159185bc89f5d1ad Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:13 +0200
Subject: bpf, maps: extend map_fd_get_ptr arguments

This patch extends map_fd_get_ptr() callback that is used by fd array
maps, so that struct file pointer from the related map can be passed
in. It's safe to remove map_update_elem() callback for the two maps since
this is only allowed from syscall side, but not from eBPF programs for these
two map types. Like in per-cpu map case, bpf_fd_array_map_update_elem()
needs to be called directly here due to the extra argument.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 16 +++++++++-------
 kernel/bpf/syscall.c  |  6 ++++++
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 76d5a794e426..bfedcbdb4d84 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
-				    void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+				 void *key, void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *new_ptr, *old_ptr;
@@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
@@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 }
 
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+				   struct file *map_file, int fd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct bpf_prog *prog = bpf_prog_get(fd);
+
 	if (IS_ERR(prog))
 		return prog;
 
@@ -382,6 +384,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
 		bpf_prog_put(prog);
 		return ERR_PTR(-EINVAL);
 	}
+
 	return prog;
 }
 
@@ -407,7 +410,6 @@ static const struct bpf_map_ops prog_array_ops = {
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -431,7 +433,8 @@ static void perf_event_array_map_free(struct bpf_map *map)
 	fd_array_map_free(map);
 }
 
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+					 struct file *map_file, int fd)
 {
 	struct perf_event *event;
 	const struct perf_event_attr *attr;
@@ -474,7 +477,6 @@ static const struct bpf_map_ops perf_event_array_ops = {
 	.map_free = perf_event_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
-	.map_update_elem = fd_array_map_update_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fc3adcd064b1..c23a4e9311b3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -392,6 +392,12 @@ static int map_update_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+		rcu_read_lock();
+		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+						   attr->flags);
+		rcu_read_unlock();
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
-- 
cgit v1.2.3


From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 15 Jun 2016 22:47:14 +0200
Subject: bpf, maps: flush own entries on perf map release

The behavior of perf event arrays are quite different from all
others as they are tightly coupled to perf event fds, f.e. shown
recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array
to use struct file") to make refcounting on perf event more robust.
A remaining issue that the current code still has is that since
additions to the perf event array take a reference on the struct
file via perf_event_get() and are only released via fput() (that
cleans up the perf event eventually via perf_event_release_kernel())
when the element is either manually removed from the map from user
space or automatically when the last reference on the perf event
map is dropped. However, this leads us to dangling struct file's
when the map gets pinned after the application owning the perf
event descriptor exits, and since the struct file reference will
in such case only be manually dropped or via pinned file removal,
it leads to the perf event living longer than necessary, consuming
needlessly resources for that time.

Relations between perf event fds and bpf perf event map fds can be
rather complex. F.e. maps can act as demuxers among different perf
event fds that can possibly be owned by different threads and based
on the index selection from the program, events get dispatched to
one of the per-cpu fd endpoints. One perf event fd (or, rather a
per-cpu set of them) can also live in multiple perf event maps at
the same time, listening for events. Also, another requirement is
that perf event fds can get closed from application side after they
have been attached to the perf event map, so that on exit perf event
map will take care of dropping their references eventually. Likewise,
when such maps are pinned, the intended behavior is that a user
application does bpf_obj_get(), puts its fds in there and on exit
when fd is released, they are dropped from the map again, so the map
acts rather as connector endpoint. This also makes perf event maps
inherently different from program arrays as described in more detail
in commit c9da161c6517 ("bpf: fix clearing on persistent program
array maps").

To tackle this, map entries are marked by the map struct file that
added the element to the map. And when the last reference to that map
struct file is released from user space, then the tracked entries
are purged from the map. This is okay, because new map struct files
instances resp. frontends to the anon inode are provided via
bpf_map_new_fd() that is called when we invoke bpf_obj_get_user()
for retrieving a pinned map, but also when an initial instance is
created via map_create(). The rest is resolved by the vfs layer
automatically for us by keeping reference count on the map's struct
file. Any concurrent updates on the map slot are fine as well, it
just means that perf_event_fd_array_release() needs to delete less
of its own entires.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c    | 102 ++++++++++++++++++++++++++++++++++-------------
 kernel/trace/bpf_trace.c |  18 ++++-----
 2 files changed, 82 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index bfedcbdb4d84..5af30732697b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -427,59 +427,105 @@ static int __init register_prog_array_map(void)
 }
 late_initcall(register_prog_array_map);
 
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+						   struct file *map_file)
 {
-	bpf_fd_array_map_clear(map);
-	fd_array_map_free(map);
+	struct bpf_event_entry *ee;
+
+	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	if (ee) {
+		ee->event = perf_file->private_data;
+		ee->perf_file = perf_file;
+		ee->map_file = map_file;
+	}
+
+	return ee;
+}
+
+static void __bpf_event_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_event_entry *ee;
+
+	ee = container_of(rcu, struct bpf_event_entry, rcu);
+	fput(ee->perf_file);
+	kfree(ee);
+}
+
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+	call_rcu(&ee->rcu, __bpf_event_entry_free);
 }
 
 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 					 struct file *map_file, int fd)
 {
-	struct perf_event *event;
 	const struct perf_event_attr *attr;
-	struct file *file;
+	struct bpf_event_entry *ee;
+	struct perf_event *event;
+	struct file *perf_file;
 
-	file = perf_event_get(fd);
-	if (IS_ERR(file))
-		return file;
+	perf_file = perf_event_get(fd);
+	if (IS_ERR(perf_file))
+		return perf_file;
 
-	event = file->private_data;
+	event = perf_file->private_data;
+	ee = ERR_PTR(-EINVAL);
 
 	attr = perf_event_attrs(event);
-	if (IS_ERR(attr))
-		goto err;
-
-	if (attr->inherit)
-		goto err;
-
-	if (attr->type == PERF_TYPE_RAW)
-		return file;
-
-	if (attr->type == PERF_TYPE_HARDWARE)
-		return file;
+	if (IS_ERR(attr) || attr->inherit)
+		goto err_out;
+
+	switch (attr->type) {
+	case PERF_TYPE_SOFTWARE:
+		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+			goto err_out;
+		/* fall-through */
+	case PERF_TYPE_RAW:
+	case PERF_TYPE_HARDWARE:
+		ee = bpf_event_entry_gen(perf_file, map_file);
+		if (ee)
+			return ee;
+		ee = ERR_PTR(-ENOMEM);
+		/* fall-through */
+	default:
+		break;
+	}
 
-	if (attr->type == PERF_TYPE_SOFTWARE &&
-	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
-		return file;
-err:
-	fput(file);
-	return ERR_PTR(-EINVAL);
+err_out:
+	fput(perf_file);
+	return ee;
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
 {
-	fput((struct file *)ptr);
+	bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+					struct file *map_file)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < array->map.max_entries; i++) {
+		ee = READ_ONCE(array->ptrs[i]);
+		if (ee && ee->map_file == map_file)
+			fd_array_map_delete_elem(map, &i);
+	}
+	rcu_read_unlock();
 }
 
 static const struct bpf_map_ops perf_event_array_ops = {
 	.map_alloc = fd_array_map_alloc,
-	.map_free = perf_event_array_map_free,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
 	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+	.map_release = perf_event_fd_array_release,
 };
 
 static struct bpf_map_type_list perf_event_array_type __read_mostly = {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 720b7bb01d43..037ea6ea3cb2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 {
 	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct file *file;
 
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!file))
+	ee = READ_ONCE(array->ptrs[index]);
+	if (unlikely(!ee))
 		return -ENOENT;
 
-	event = file->private_data;
-
+	event = ee->event;
 	/* make sure event is local and doesn't have pmu::count */
 	if (event->oncpu != smp_processor_id() ||
 	    event->pmu->count)
@@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
+	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct file *file;
 	struct perf_raw_record raw = {
 		.size = size,
 		.data = data,
@@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!file))
+	ee = READ_ONCE(array->ptrs[index]);
+	if (unlikely(!ee))
 		return -ENOENT;
 
-	event = file->private_data;
-
+	event = ee->event;
 	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
 		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
 		return -EINVAL;
-- 
cgit v1.2.3


From 66b12abc846d31e75fa5f2f31db1396ddfa8ee4a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Thu, 16 Jun 2016 17:08:19 -0400
Subject: audit: fix some horrible switch statement style crimes

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditfilter.c | 8 ++++++--
 kernel/auditsc.c     | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8a8aa3fbc8d8..ff59a5eed691 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1343,8 +1343,12 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type,
 			return result;
 	}
 	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	case AUDIT_NEVER:
+		*state = AUDIT_DISABLED;
+		break;
+	case AUDIT_ALWAYS:
+		*state = AUDIT_RECORD_CONTEXT;
+		break;
 	}
 	return 1;
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7d0e3cf8abe1..ec4c552876a7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -695,8 +695,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 		ctx->prio = rule->prio;
 	}
 	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	case AUDIT_NEVER:
+		*state = AUDIT_DISABLED;
+		break;
+	case AUDIT_ALWAYS:
+		*state = AUDIT_RECORD_CONTEXT;
+		break;
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From 8c8a5502183c724854afd2f143a72f7eb71b6fea Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 17 Jun 2016 12:23:59 -0400
Subject: cgroup: fix idr leak for the first cgroup root

The valid cgroup hierarchy ID range includes 0, so we can't filter for
positive numbers when freeing it, or it'll leak the first ID. No big
deal, just disruptive when reading the code.

The ID is freed during error handling and when the reference count
hits zero, so the double-free test is not necessary; remove it.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..36fc0ff506c3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1158,18 +1158,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (root->hierarchy_id) {
-		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
-		root->hierarchy_id = 0;
-	}
+	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
-		/* hierarchy ID should already have been released */
-		WARN_ON_ONCE(root->hierarchy_id);
-
 		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
-- 
cgit v1.2.3


From d6ccc55e66ccdbc8ad0eeda14419f8eaccbc246b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 17 Jun 2016 12:24:27 -0400
Subject: cgroup: remove unnecessary 0 check from css_from_id()

css_idr allocation starts at 1, so index 0 will never point to an
item. css_from_id() currently filters that before asking idr_find(),
but idr_find() would also just return NULL, so this is not needed.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 36fc0ff506c3..78f6d18ff0af 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6162,7 +6162,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+	return idr_find(&ss->css_idr, id);
 }
 
 /**
-- 
cgit v1.2.3


From 59a37f8baeb2c9d97f316584c90892d18bf846d4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 17 Jun 2016 16:58:26 +0200
Subject: blktrace: avoid using timespec

The blktrace code stores the current time in a 32-bit word in its
user interface. This is a bad idea because 32-bit seconds overflow
at some point.

We probably have until 2106 before this one overflows, as it seems
to use an 'unsigned' variable, but we should confirm that user
space treats it the same way.

Aside from this, we want to stop using 'struct timespec' here,
so I'm adding a comment about the overflow and change the code
to use timespec64 instead to make the loss of range more obvious.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 03b0dd98ff0e..bedb84d168d1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
 
 static void trace_note_time(struct blk_trace *bt)
 {
-	struct timespec now;
+	struct timespec64 now;
 	unsigned long flags;
 	u32 words[2];
 
-	getnstimeofday(&now);
-	words[0] = now.tv_sec;
+	/* need to check user space to see if this breaks in y2038 or y2106 */
+	ktime_get_real_ts64(&now);
+	words[0] = (u32)now.tv_sec;
 	words[1] = now.tv_nsec;
 
 	local_irq_save(flags);
-- 
cgit v1.2.3


From 4e267db135c44d0b18e553899fe7df32b89211a5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Apr 2016 07:38:13 -0400
Subject: tracing: Make the pid filtering helper functions global

Make the functions used for pid filtering global for tracing, such that the
function tracer can use the pid code as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |  9 +++++++++
 kernel/trace/trace_events.c | 34 +++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5167c366d6b7..172330891c6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -628,6 +628,15 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_thresh;
 
+/* PID filtering */
+bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
+			     pid_t search_pid);
+bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
+			    struct task_struct *task);
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task);
+
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3d4155892a1e..b5e514c4dada 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,8 +503,8 @@ static void ftrace_clear_events(struct trace_array *tr)
 extern int pid_max;
 
 /* Returns true if found in filter */
-static bool
-find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
 	/*
 	 * If pid_max changed after filtered_pids was created, we
@@ -516,8 +516,8 @@ find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 	return test_bit(search_pid, filtered_pids->pids);
 }
 
-static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
 	/*
 	 * Return false, because if filtered_pids does not exist,
@@ -526,19 +526,19 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	return !find_filtered_pid(filtered_pids, task->pid);
+	return !trace_find_filtered_pid(filtered_pids, task->pid);
 }
 
-static void filter_add_remove_task(struct trace_pid_list *pid_list,
-				   struct task_struct *self,
-				   struct task_struct *task)
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task)
 {
 	if (!pid_list)
 		return;
 
 	/* For forks, we only add if the forking task is listed */
 	if (self) {
-		if (!find_filtered_pid(pid_list, self->pid))
+		if (!trace_find_filtered_pid(pid_list, self->pid))
 			return;
 	}
 
@@ -560,7 +560,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 	struct trace_array *tr = data;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
-	filter_add_remove_task(pid_list, NULL, task);
+	trace_filter_add_remove_task(pid_list, NULL, task);
 }
 
 static void
@@ -572,7 +572,7 @@ event_filter_pid_sched_process_fork(void *data,
 	struct trace_array *tr = data;
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
-	filter_add_remove_task(pid_list, self, task);
+	trace_filter_add_remove_task(pid_list, self, task);
 }
 
 void trace_event_follow_fork(struct trace_array *tr, bool enable)
@@ -600,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, prev) &&
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, prev) &&
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -614,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, next));
+		       trace_ignore_this_task(pid_list, next));
 }
 
 static void
@@ -630,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, task));
+		       trace_ignore_this_task(pid_list, task));
 }
 
 static void
@@ -647,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1654,7 +1654,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       ignore_this_task(pid_list, current));
+		       trace_ignore_this_task(pid_list, current));
 }
 
 static ssize_t
-- 
cgit v1.2.3


From d8275c454dcdba296675221b4c12f19d1b6e0ee8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Apr 2016 12:15:22 -0400
Subject: tracing: Move filtered_pid helper functions into trace.c

As the filtered_pid functions are going to be used by function tracer as
well as trace_events, move the code into the generic trace.c file.

The functions moved are:

 trace_find_filtered_pid()
 trace_ignore_this_task()
 trace_filter_add_remove_task()

Kernel Doc text was also added.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 78 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c | 51 -----------------------------
 2 files changed, 78 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a4bd6b68a0b..0b87fe8e6d0b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -319,6 +319,84 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 	return 0;
 }
 
+/**
+ * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
+ * @filtered_pids: The list of pids to check
+ * @search_pid: The PID to find in @filtered_pids
+ *
+ * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis.
+ */
+bool
+trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
+{
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
+
+/**
+ * trace_ignore_this_task - should a task be ignored for tracing
+ * @filtered_pids: The list of pids to check
+ * @task: The task that should be ignored if not filtered
+ *
+ * Checks if @task should be traced or not from @filtered_pids.
+ * Returns true if @task should *NOT* be traced.
+ * Returns false if @task should be traced.
+ */
+bool
+trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+	/*
+	 * Return false, because if filtered_pids does not exist,
+	 * all pids are good to trace.
+	 */
+	if (!filtered_pids)
+		return false;
+
+	return !trace_find_filtered_pid(filtered_pids, task->pid);
+}
+
+/**
+ * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * @pid_list: The list to modify
+ * @self: The current task for fork or NULL for exit
+ * @task: The task to add or remove
+ *
+ * If adding a task, if @self is defined, the task is only added if @self
+ * is also included in @pid_list. This happens on fork and tasks should
+ * only be added when the parent is listed. If @self is NULL, then the
+ * @task pid will be removed from the list, which would happen on exit
+ * of a task.
+ */
+void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
+				  struct task_struct *self,
+				  struct task_struct *task)
+{
+	if (!pid_list)
+		return;
+
+	/* For forks, we only add if the forking task is listed */
+	if (self) {
+		if (!trace_find_filtered_pid(pid_list, self->pid))
+			return;
+	}
+
+	/* Sorry, but we don't support pid_max changing after setting */
+	if (task->pid >= pid_list->pid_max)
+		return;
+
+	/* "self" is set for forks, and NULL for exits */
+	if (self)
+		set_bit(task->pid, pid_list->pids);
+	else
+		clear_bit(task->pid, pid_list->pids);
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b5e514c4dada..a11e6d9a3841 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -502,57 +502,6 @@ static void ftrace_clear_events(struct trace_array *tr)
 /* Shouldn't this be in a header? */
 extern int pid_max;
 
-/* Returns true if found in filter */
-bool
-trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
-{
-	/*
-	 * If pid_max changed after filtered_pids was created, we
-	 * by default ignore all pids greater than the previous pid_max.
-	 */
-	if (search_pid >= filtered_pids->pid_max)
-		return false;
-
-	return test_bit(search_pid, filtered_pids->pids);
-}
-
-bool
-trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
-{
-	/*
-	 * Return false, because if filtered_pids does not exist,
-	 * all pids are good to trace.
-	 */
-	if (!filtered_pids)
-		return false;
-
-	return !trace_find_filtered_pid(filtered_pids, task->pid);
-}
-
-void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
-				  struct task_struct *self,
-				  struct task_struct *task)
-{
-	if (!pid_list)
-		return;
-
-	/* For forks, we only add if the forking task is listed */
-	if (self) {
-		if (!trace_find_filtered_pid(pid_list, self->pid))
-			return;
-	}
-
-	/* Sorry, but we don't support pid_max changing after setting */
-	if (task->pid >= pid_list->pid_max)
-		return;
-
-	/* "self" is set for forks, and NULL for exits */
-	if (self)
-		set_bit(task->pid, pid_list->pids);
-	else
-		clear_bit(task->pid, pid_list->pids);
-}
-
 static void
 event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 {
-- 
cgit v1.2.3


From 5cc8976bd52153678ca37cc1e3000833b20276f3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 20 Apr 2016 15:19:54 -0400
Subject: tracing: Move the pid_list seq_file functions to be global

To allow other aspects of ftrace to use the pid_list logic, we need to reuse
the seq_file functions. Making the generic part into functions that can be
called by other files will help in this regard.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 71 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  3 ++
 kernel/trace/trace_events.c | 34 ++--------------------
 3 files changed, 77 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0b87fe8e6d0b..7943e306cc7f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -397,6 +397,77 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 		clear_bit(task->pid, pid_list->pids);
 }
 
+/**
+ * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
+ * @pid_list: The pid list to show
+ * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
+ * @pos: The position of the file
+ *
+ * This is used by the seq_file "next" operation to iterate the pids
+ * listed in a trace_pid_list structure.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
+{
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual prevous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
+/**
+ * trace_pid_start - Used for seq_file to start reading pid lists
+ * @pid_list: The pid list to show
+ * @pos: The position of the file
+ *
+ * This is used by seq_file "start" operation to start the iteration
+ * of listing pids.
+ *
+ * Returns the pid+1 as we want to display pid of zero, but NULL would
+ * stop the iteration.
+ */
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
+{
+	unsigned long pid;
+	loff_t l = 0;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
+		return NULL;
+
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
+		;
+	return (void *)pid;
+}
+
+/**
+ * trace_pid_show - show the current pid in seq_file processing
+ * @m: The seq_file structure to write into
+ * @v: A void pointer of the pid (+1) value to display
+ *
+ * Can be directly used by seq_file operations to display the current
+ * pid value.
+ */
+int trace_pid_show(struct seq_file *m, void *v)
+{
+	unsigned long pid = (unsigned long)v - 1;
+
+	seq_printf(m, "%lu\n", pid);
+	return 0;
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 172330891c6d..45442d5842f2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -636,6 +636,9 @@ bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
 void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 				  struct task_struct *self,
 				  struct task_struct *task);
+void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
+void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
+int trace_pid_show(struct seq_file *m, void *v);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a11e6d9a3841..fd831a972bae 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -983,18 +983,8 @@ p_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-	unsigned long pid = (unsigned long)v;
 
-	(*pos)++;
-
-	/* pid already is +1 of the actual prevous bit */
-	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
-
-	/* Return pid + 1 to allow zero to be represented */
-	if (pid < pid_list->pid_max)
-		return (void *)(pid + 1);
-
-	return NULL;
+	return trace_pid_next(pid_list, v, pos);
 }
 
 static void *p_start(struct seq_file *m, loff_t *pos)
@@ -1002,8 +992,6 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
-	unsigned long pid;
-	loff_t l = 0;
 
 	/*
 	 * Grab the mutex, to keep calls to p_next() having the same
@@ -1019,15 +1007,7 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 	if (!pid_list)
 		return NULL;
 
-	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
-	if (pid >= pid_list->pid_max)
-		return NULL;
-
-	/* Return pid + 1 so that zero can be the exit value */
-	for (pid++; pid && l < *pos;
-	     pid = (unsigned long)p_next(m, (void *)pid, &l))
-		;
-	return (void *)pid;
+	return trace_pid_start(pid_list, pos);
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -1037,14 +1017,6 @@ static void p_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
-static int p_show(struct seq_file *m, void *v)
-{
-	unsigned long pid = (unsigned long)v - 1;
-
-	seq_printf(m, "%lu\n", pid);
-	return 0;
-}
-
 static ssize_t
 event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -1795,7 +1767,7 @@ static const struct seq_operations show_set_event_seq_ops = {
 static const struct seq_operations show_set_pid_seq_ops = {
 	.start = p_start,
 	.next = p_next,
-	.show = p_show,
+	.show = trace_pid_show,
 	.stop = p_stop,
 };
 
-- 
cgit v1.2.3


From 76c813e26606d35ea9d8d6f96e646b3944c730a9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 21 Apr 2016 11:35:30 -0400
Subject: tracing: Move pid_list write processing into its own function

The addition of PIDs into a pid_list via the write operation of
set_event_pid is a bit complex. The same operation will be needed for
function tracing pids. Move the code into its own generic function in
trace.c, so that we can avoid duplication of this code.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 109 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h        |   7 +++
 kernel/trace/trace_events.c | 110 ++++----------------------------------------
 3 files changed, 124 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7943e306cc7f..a8bb7485fd1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -25,7 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/linkage.h>
 #include <linux/uaccess.h>
-#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -319,6 +319,12 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 	return 0;
 }
 
+void trace_free_pid_list(struct trace_pid_list *pid_list)
+{
+	vfree(pid_list->pids);
+	kfree(pid_list);
+}
+
 /**
  * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
  * @filtered_pids: The list of pids to check
@@ -468,6 +474,107 @@ int trace_pid_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+/* 128 should be much more than enough */
+#define PID_BUF_SIZE		127
+
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+		    struct trace_pid_list **new_pid_list,
+		    const char __user *ubuf, size_t cnt)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_parser parser;
+	unsigned long val;
+	int nr_pids = 0;
+	ssize_t read = 0;
+	ssize_t ret = 0;
+	loff_t pos;
+	pid_t pid;
+
+	if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
+		return -ENOMEM;
+
+	/*
+	 * Always recreate a new array. The write is an all or nothing
+	 * operation. Always create a new array when adding new pids by
+	 * the user. If the operation fails, then the current list is
+	 * not modified.
+	 */
+	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+	if (!pid_list)
+		return -ENOMEM;
+
+	pid_list->pid_max = READ_ONCE(pid_max);
+
+	/* Only truncating will shrink pid_max */
+	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+		pid_list->pid_max = filtered_pids->pid_max;
+
+	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+	if (!pid_list->pids) {
+		kfree(pid_list);
+		return -ENOMEM;
+	}
+
+	if (filtered_pids) {
+		/* copy the current bits to the new max */
+		pid = find_first_bit(filtered_pids->pids,
+				     filtered_pids->pid_max);
+		while (pid < filtered_pids->pid_max) {
+			set_bit(pid, pid_list->pids);
+			pid = find_next_bit(filtered_pids->pids,
+					    filtered_pids->pid_max,
+					    pid + 1);
+			nr_pids++;
+		}
+	}
+
+	while (cnt > 0) {
+
+		pos = 0;
+
+		ret = trace_get_user(&parser, ubuf, cnt, &pos);
+		if (ret < 0 || !trace_parser_loaded(&parser))
+			break;
+
+		read += ret;
+		ubuf += ret;
+		cnt -= ret;
+
+		parser.buffer[parser.idx] = 0;
+
+		ret = -EINVAL;
+		if (kstrtoul(parser.buffer, 0, &val))
+			break;
+		if (val >= pid_list->pid_max)
+			break;
+
+		pid = (pid_t)val;
+
+		set_bit(pid, pid_list->pids);
+		nr_pids++;
+
+		trace_parser_clear(&parser);
+		ret = 0;
+	}
+	trace_parser_put(&parser);
+
+	if (ret < 0) {
+		trace_free_pid_list(pid_list);
+		return ret;
+	}
+
+	if (!nr_pids) {
+		/* Cleared the list of pids */
+		trace_free_pid_list(pid_list);
+		read = ret;
+		pid_list = NULL;
+	}
+
+	*new_pid_list = pid_list;
+
+	return read;
+}
+
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
 	u64 ts;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 45442d5842f2..a4dce1ef9e03 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -629,6 +629,9 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 extern unsigned long tracing_thresh;
 
 /* PID filtering */
+
+extern int pid_max;
+
 bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
 			     pid_t search_pid);
 bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -639,6 +642,10 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
 void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos);
 void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos);
 int trace_pid_show(struct seq_file *m, void *v);
+void trace_free_pid_list(struct trace_pid_list *pid_list);
+int trace_pid_write(struct trace_pid_list *filtered_pids,
+		    struct trace_pid_list **new_pid_list,
+		    const char __user *ubuf, size_t cnt);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fd831a972bae..fd449eb138cf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,6 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -499,9 +498,6 @@ static void ftrace_clear_events(struct trace_array *tr)
 	mutex_unlock(&event_mutex);
 }
 
-/* Shouldn't this be in a header? */
-extern int pid_max;
-
 static void
 event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 {
@@ -634,8 +630,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_sched();
 
-	vfree(pid_list->pids);
-	kfree(pid_list);
+	trace_free_pid_list(pid_list);
 }
 
 static void ftrace_clear_event_pids(struct trace_array *tr)
@@ -1587,13 +1582,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	struct trace_pid_list *filtered_pids = NULL;
 	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
-	struct trace_parser parser;
-	unsigned long val;
-	loff_t this_pos;
-	ssize_t read = 0;
-	ssize_t ret = 0;
-	pid_t pid;
-	int nr_pids = 0;
+	ssize_t ret;
 
 	if (!cnt)
 		return 0;
@@ -1602,93 +1591,15 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
-		return -ENOMEM;
-
 	mutex_lock(&event_mutex);
+
 	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
 					     lockdep_is_held(&event_mutex));
 
-	/*
-	 * Always recreate a new array. The write is an all or nothing
-	 * operation. Always create a new array when adding new pids by
-	 * the user. If the operation fails, then the current list is
-	 * not modified.
-	 */
-	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-	if (!pid_list) {
-		read = -ENOMEM;
-		goto out;
-	}
-	pid_list->pid_max = READ_ONCE(pid_max);
-	/* Only truncating will shrink pid_max */
-	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
-		pid_list->pid_max = filtered_pids->pid_max;
-	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
-	if (!pid_list->pids) {
-		kfree(pid_list);
-		read = -ENOMEM;
-		goto out;
-	}
-	if (filtered_pids) {
-		/* copy the current bits to the new max */
-		pid = find_first_bit(filtered_pids->pids,
-				     filtered_pids->pid_max);
-		while (pid < filtered_pids->pid_max) {
-			set_bit(pid, pid_list->pids);
-			pid = find_next_bit(filtered_pids->pids,
-					    filtered_pids->pid_max,
-					    pid + 1);
-			nr_pids++;
-		}
-	}
-
-	while (cnt > 0) {
-
-		this_pos = 0;
-
-		ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
-		if (ret < 0 || !trace_parser_loaded(&parser))
-			break;
-
-		read += ret;
-		ubuf += ret;
-		cnt -= ret;
-
-		parser.buffer[parser.idx] = 0;
-
-		ret = -EINVAL;
-		if (kstrtoul(parser.buffer, 0, &val))
-			break;
-		if (val >= pid_list->pid_max)
-			break;
-
-		pid = (pid_t)val;
-
-		set_bit(pid, pid_list->pids);
-		nr_pids++;
-
-		trace_parser_clear(&parser);
-		ret = 0;
-	}
-	trace_parser_put(&parser);
-
-	if (ret < 0) {
-		vfree(pid_list->pids);
-		kfree(pid_list);
-		read = ret;
+	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+	if (ret < 0)
 		goto out;
-	}
 
-	if (!nr_pids) {
-		/* Cleared the list of pids */
-		vfree(pid_list->pids);
-		kfree(pid_list);
-		read = ret;
-		if (!filtered_pids)
-			goto out;
-		pid_list = NULL;
-	}
 	rcu_assign_pointer(tr->filtered_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -1697,10 +1608,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 
 	if (filtered_pids) {
 		synchronize_sched();
-
-		vfree(filtered_pids->pids);
-		kfree(filtered_pids);
-	} else {
+		trace_free_pid_list(filtered_pids);
+	} else if (pid_list) {
 		/*
 		 * Register a probe that is called before all other probes
 		 * to set ignore_pid if next or prev do not match.
@@ -1738,9 +1647,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
  out:
 	mutex_unlock(&event_mutex);
 
-	ret = read;
-	if (read > 0)
-		*ppos += read;
+	if (ret > 0)
+		*ppos += ret;
 
 	return ret;
 }
-- 
cgit v1.2.3


From 345ddcc882d8896dcbdcb3e0ee4a415fc23ec8b0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 22 Apr 2016 18:11:33 -0400
Subject: ftrace: Have set_ftrace_pid use the bitmap like events do

Convert set_ftrace_pid to use the bitmap like set_event_pid does. This
allows for instances to use the pid filtering as well, and will allow for
function-fork option to set if the children of a traced function should be
traced or not.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                | 313 +++++++++++++++--------------------
 kernel/trace/trace.c                 |   1 +
 kernel/trace/trace.h                 |  15 +-
 kernel/trace/trace_functions.c       |   2 +-
 kernel/trace/trace_functions_graph.c |   2 +-
 5 files changed, 148 insertions(+), 185 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900dbb1efff2..8b488f4dd8e8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
 /* What to set function_trace_op to */
 static struct ftrace_ops *set_function_trace_op;
 
-/* List for set_ftrace_pid's pids. */
-LIST_HEAD(ftrace_pids);
-struct ftrace_pid {
-	struct list_head list;
-	struct pid *pid;
-};
-
-static bool ftrace_pids_enabled(void)
+static bool ftrace_pids_enabled(struct ftrace_ops *ops)
 {
-	return !list_empty(&ftrace_pids);
+	struct trace_array *tr;
+
+	if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private)
+		return false;
+
+	tr = ops->private;
+
+	return tr->function_pids != NULL;
 }
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops);
@@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void)
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
 			    struct ftrace_ops *op, struct pt_regs *regs)
 {
-	if (!test_tsk_trace_trace(current))
+	struct trace_array *tr = op->private;
+
+	if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid))
 		return;
 
 	op->saved_func(ip, parent_ip, op, regs);
@@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	/* Always save the function, and reset at unregistering */
 	ops->saved_func = ops->func;
 
-	if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
+	if (ftrace_pids_enabled(ops))
 		ops->func = ftrace_pid_func;
 
 	ftrace_update_trampoline(ops);
@@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 
 static void ftrace_update_pid_func(void)
 {
-	bool enabled = ftrace_pids_enabled();
 	struct ftrace_ops *op;
 
 	/* Only do something if we are tracing something */
@@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void)
 
 	do_for_each_ftrace_op(op, ftrace_ops_list) {
 		if (op->flags & FTRACE_OPS_FL_PID) {
-			op->func = enabled ? ftrace_pid_func :
-				op->saved_func;
+			op->func = ftrace_pids_enabled(op) ?
+				ftrace_pid_func : op->saved_func;
 			ftrace_update_trampoline(op);
 		}
 	} while_for_each_ftrace_op(op);
@@ -5324,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 	return ops->func;
 }
 
-static void clear_ftrace_swapper(void)
+static void
+ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
+		    struct task_struct *prev, struct task_struct *next)
 {
-	struct task_struct *p;
-	int cpu;
+	struct trace_array *tr = data;
+	struct trace_pid_list *pid_list;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		p = idle_task(cpu);
-		clear_tsk_trace_trace(p);
-	}
-	put_online_cpus();
-}
-
-static void set_ftrace_swapper(void)
-{
-	struct task_struct *p;
-	int cpu;
+	pid_list = rcu_dereference_sched(tr->function_pids);
 
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		p = idle_task(cpu);
-		set_tsk_trace_trace(p);
-	}
-	put_online_cpus();
+	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+		       trace_ignore_this_task(pid_list, next));
 }
 
-static void clear_ftrace_pid(struct pid *pid)
+static void clear_ftrace_pids(struct trace_array *tr)
 {
-	struct task_struct *p;
+	struct trace_pid_list *pid_list;
+	int cpu;
 
-	rcu_read_lock();
-	do_each_pid_task(pid, PIDTYPE_PID, p) {
-		clear_tsk_trace_trace(p);
-	} while_each_pid_task(pid, PIDTYPE_PID, p);
-	rcu_read_unlock();
+	pid_list = rcu_dereference_protected(tr->function_pids,
+					     lockdep_is_held(&ftrace_lock));
+	if (!pid_list)
+		return;
 
-	put_pid(pid);
-}
+	unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
 
-static void set_ftrace_pid(struct pid *pid)
-{
-	struct task_struct *p;
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false;
 
-	rcu_read_lock();
-	do_each_pid_task(pid, PIDTYPE_PID, p) {
-		set_tsk_trace_trace(p);
-	} while_each_pid_task(pid, PIDTYPE_PID, p);
-	rcu_read_unlock();
-}
+	rcu_assign_pointer(tr->function_pids, NULL);
 
-static void clear_ftrace_pid_task(struct pid *pid)
-{
-	if (pid == ftrace_swapper_pid)
-		clear_ftrace_swapper();
-	else
-		clear_ftrace_pid(pid);
-}
+	/* Wait till all users are no longer using pid filtering */
+	synchronize_sched();
 
-static void set_ftrace_pid_task(struct pid *pid)
-{
-	if (pid == ftrace_swapper_pid)
-		set_ftrace_swapper();
-	else
-		set_ftrace_pid(pid);
+	trace_free_pid_list(pid_list);
 }
 
-static int ftrace_pid_add(int p)
+static void ftrace_pid_reset(struct trace_array *tr)
 {
-	struct pid *pid;
-	struct ftrace_pid *fpid;
-	int ret = -EINVAL;
-
 	mutex_lock(&ftrace_lock);
-
-	if (!p)
-		pid = ftrace_swapper_pid;
-	else
-		pid = find_get_pid(p);
-
-	if (!pid)
-		goto out;
-
-	ret = 0;
-
-	list_for_each_entry(fpid, &ftrace_pids, list)
-		if (fpid->pid == pid)
-			goto out_put;
-
-	ret = -ENOMEM;
-
-	fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
-	if (!fpid)
-		goto out_put;
-
-	list_add(&fpid->list, &ftrace_pids);
-	fpid->pid = pid;
-
-	set_ftrace_pid_task(pid);
+	clear_ftrace_pids(tr);
 
 	ftrace_update_pid_func();
-
 	ftrace_startup_all(0);
 
 	mutex_unlock(&ftrace_lock);
-	return 0;
-
-out_put:
-	if (pid != ftrace_swapper_pid)
-		put_pid(pid);
-
-out:
-	mutex_unlock(&ftrace_lock);
-	return ret;
 }
 
-static void ftrace_pid_reset(void)
-{
-	struct ftrace_pid *fpid, *safe;
-
-	mutex_lock(&ftrace_lock);
-	list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
-		struct pid *pid = fpid->pid;
-
-		clear_ftrace_pid_task(pid);
-
-		list_del(&fpid->list);
-		kfree(fpid);
-	}
-
-	ftrace_update_pid_func();
-	ftrace_startup_all(0);
-
-	mutex_unlock(&ftrace_lock);
-}
+/* Greater than any max PID */
+#define FTRACE_NO_PIDS		(void *)(PID_MAX_LIMIT + 1)
 
 static void *fpid_start(struct seq_file *m, loff_t *pos)
+	__acquires(RCU)
 {
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = m->private;
+
 	mutex_lock(&ftrace_lock);
+	rcu_read_lock_sched();
 
-	if (!ftrace_pids_enabled() && (!*pos))
-		return (void *) 1;
+	pid_list = rcu_dereference_sched(tr->function_pids);
 
-	return seq_list_start(&ftrace_pids, *pos);
+	if (!pid_list)
+		return !(*pos) ? FTRACE_NO_PIDS : NULL;
+
+	return trace_pid_start(pid_list, pos);
 }
 
 static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	if (v == (void *)1)
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
+
+	if (v == FTRACE_NO_PIDS)
 		return NULL;
 
-	return seq_list_next(v, &ftrace_pids, pos);
+	return trace_pid_next(pid_list, v, pos);
 }
 
 static void fpid_stop(struct seq_file *m, void *p)
+	__releases(RCU)
 {
+	rcu_read_unlock_sched();
 	mutex_unlock(&ftrace_lock);
 }
 
 static int fpid_show(struct seq_file *m, void *v)
 {
-	const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
-
-	if (v == (void *)1) {
+	if (v == FTRACE_NO_PIDS) {
 		seq_puts(m, "no pid\n");
 		return 0;
 	}
 
-	if (fpid->pid == ftrace_swapper_pid)
-		seq_puts(m, "swapper tasks\n");
-	else
-		seq_printf(m, "%u\n", pid_vnr(fpid->pid));
-
-	return 0;
+	return trace_pid_show(m, v);
 }
 
 static const struct seq_operations ftrace_pid_sops = {
@@ -5509,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = {
 static int
 ftrace_pid_open(struct inode *inode, struct file *file)
 {
+	struct trace_array *tr = inode->i_private;
+	struct seq_file *m;
 	int ret = 0;
 
+	if (trace_array_get(tr) < 0)
+		return -ENODEV;
+
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
-		ftrace_pid_reset();
+		ftrace_pid_reset(tr);
 
-	if (file->f_mode & FMODE_READ)
-		ret = seq_open(file, &ftrace_pid_sops);
+	ret = seq_open(file, &ftrace_pid_sops);
+	if (ret < 0) {
+		trace_array_put(tr);
+	} else {
+		m = file->private_data;
+		/* copy tr over to seq ops */
+		m->private = tr;
+	}
 
 	return ret;
 }
 
+static void ignore_task_cpu(void *data)
+{
+	struct trace_array *tr = data;
+	struct trace_pid_list *pid_list;
+
+	/*
+	 * This function is called by on_each_cpu() while the
+	 * event_mutex is held.
+	 */
+	pid_list = rcu_dereference_protected(tr->function_pids,
+					     mutex_is_locked(&ftrace_lock));
+
+	this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid,
+		       trace_ignore_this_task(pid_list, current));
+}
+
 static ssize_t
 ftrace_pid_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	char buf[64], *tmp;
-	long val;
-	int ret;
+	struct seq_file *m = filp->private_data;
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *filtered_pids = NULL;
+	struct trace_pid_list *pid_list;
+	ssize_t ret;
 
-	if (cnt >= sizeof(buf))
-		return -EINVAL;
+	if (!cnt)
+		return 0;
+
+	mutex_lock(&ftrace_lock);
+
+	filtered_pids = rcu_dereference_protected(tr->function_pids,
+					     lockdep_is_held(&ftrace_lock));
+
+	ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
+	if (ret < 0)
+		goto out;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
+	rcu_assign_pointer(tr->function_pids, pid_list);
 
-	buf[cnt] = 0;
+	if (filtered_pids) {
+		synchronize_sched();
+		trace_free_pid_list(filtered_pids);
+	} else if (pid_list) {
+		/* Register a probe to set whether to ignore the tracing of a task */
+		register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr);
+	}
 
 	/*
-	 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
-	 * to clean the filter quietly.
+	 * Ignoring of pids is done at task switch. But we have to
+	 * check for those tasks that are currently running.
+	 * Always do this in case a pid was appended or removed.
 	 */
-	tmp = strstrip(buf);
-	if (strlen(tmp) == 0)
-		return 1;
+	on_each_cpu(ignore_task_cpu, tr, 1);
 
-	ret = kstrtol(tmp, 10, &val);
-	if (ret < 0)
-		return ret;
+	ftrace_update_pid_func();
+	ftrace_startup_all(0);
+ out:
+	mutex_unlock(&ftrace_lock);
 
-	ret = ftrace_pid_add(val);
+	if (ret > 0)
+		*ppos += ret;
 
-	return ret ? ret : cnt;
+	return ret;
 }
 
 static int
 ftrace_pid_release(struct inode *inode, struct file *file)
 {
-	if (file->f_mode & FMODE_READ)
-		seq_release(inode, file);
+	struct trace_array *tr = inode->i_private;
 
-	return 0;
+	trace_array_put(tr);
+
+	return seq_release(inode, file);
 }
 
 static const struct file_operations ftrace_pid_fops = {
@@ -5571,24 +5537,17 @@ static const struct file_operations ftrace_pid_fops = {
 	.release	= ftrace_pid_release,
 };
 
-static __init int ftrace_init_tracefs(void)
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	struct dentry *d_tracer;
-
-	d_tracer = tracing_init_dentry();
-	if (IS_ERR(d_tracer))
-		return 0;
-
-	ftrace_init_dyn_tracefs(d_tracer);
+	/* Only the top level directory has the dyn_tracefs and profile */
+	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
+		ftrace_init_dyn_tracefs(d_tracer);
+		ftrace_profile_tracefs(d_tracer);
+	}
 
 	trace_create_file("set_ftrace_pid", 0644, d_tracer,
-			    NULL, &ftrace_pid_fops);
-
-	ftrace_profile_tracefs(d_tracer);
-
-	return 0;
+			    tr, &ftrace_pid_fops);
 }
-fs_initcall(ftrace_init_tracefs);
 
 /**
  * ftrace_kill - kill ftrace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a8bb7485fd1d..aa240551fc5d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7233,6 +7233,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	for_each_tracing_cpu(cpu)
 		tracing_init_tracefs_percpu(tr, cpu);
 
+	ftrace_init_tracefs(tr, d_tracer);
 }
 
 static struct vfsmount *trace_automount(void *ingore)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a4dce1ef9e03..eaee458755a4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -156,6 +156,9 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 
 	bool			ignore_pid;
+#ifdef CONFIG_FUNCTION_TRACER
+	bool			ftrace_ignore_pid;
+#endif
 };
 
 struct tracer;
@@ -247,6 +250,7 @@ struct trace_array {
 	int			ref;
 #ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops	*ops;
+	struct trace_pid_list	__rcu *function_pids;
 	/* function tracing enabled */
 	int			function_enabled;
 #endif
@@ -840,12 +844,9 @@ extern struct list_head ftrace_pids;
 
 #ifdef CONFIG_FUNCTION_TRACER
 extern bool ftrace_filter_param __initdata;
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
 {
-	if (list_empty(&ftrace_pids))
-		return 1;
-
-	return test_tsk_trace_trace(task);
+	return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid);
 }
 extern int ftrace_is_dead(void);
 int ftrace_create_function_files(struct trace_array *tr,
@@ -855,8 +856,9 @@ void ftrace_init_global_array_ops(struct trace_array *tr);
 void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
 void ftrace_reset_array_ops(struct trace_array *tr);
 int using_ftrace_ops_list_func(void);
+void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
 #else
-static inline int ftrace_trace_task(struct task_struct *task)
+static inline int ftrace_trace_task(struct trace_array *tr)
 {
 	return 1;
 }
@@ -871,6 +873,7 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
 static inline __init void
 ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
+static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5a095c2e4b69..0efa00d80623 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
 
 	/* Currently only the non stack verision is supported */
 	ops->func = function_trace_call;
-	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
+	ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
 
 	tr->ops = ops;
 	ops->private = tr;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3a0244ff7ea8..67cce7896aeb 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
-	if (!ftrace_trace_task(current))
+	if (!ftrace_trace_task(tr))
 		return 0;
 
 	/* trace it when it is-nested-in or is a function enabled. */
-- 
cgit v1.2.3


From 35abb67de744b5dbaec54381f2f9e0246089331d Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 8 Jun 2016 18:38:02 -0700
Subject: tracing: expose current->comm to [ku]probe events

ftrace is very quick to give up on saving the task command line (see
`trace_save_cmdline()`). The workaround for events which really care
about the command line is to explicitly assign it as part of the entry.
However, this doesn't work for kprobe events, as there's no
straightforward way to get access to current->comm. Add a kprobe/uprobe
event variable $comm which provides exactly that.

Link: http://lkml.kernel.org/r/f59b472033b943a370f5f48d0af37698f409108f.1465435894.git.osandov@fb.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c |  1 +
 kernel/trace/trace_probe.c  | 33 +++++++++++++++++++++++++++++++++
 kernel/trace/trace_probe.h  | 10 ++++++++++
 3 files changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5546eec0505f..9aedb0b06683 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv)
 	 *  $retval	: fetch return value
 	 *  $stack	: fetch stack address
 	 *  $stackN	: fetch Nth of stack (N:0-)
+	 *  $comm       : fetch current task comm
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
 	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
 	 *  %REG	: fetch register REG
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1d372fa6fefb..74e80a582c28 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
 	kfree(data);
 }
 
+void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs,
+					  void *data, void *dest)
+{
+	int maxlen = get_rloc_len(*(u32 *)dest);
+	u8 *dst = get_rloc_data(dest);
+	long ret;
+
+	if (!maxlen)
+		return;
+
+	ret = strlcpy(dst, current->comm, maxlen);
+	*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string));
+
+void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs,
+					       void *data, void *dest)
+{
+	*(u32 *)dest = strlen(current->comm) + 1;
+}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size));
+
 static const struct fetch_type *find_fetch_type(const char *type,
 						const struct fetch_type *ftbl)
 {
@@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
 			}
 		} else
 			ret = -EINVAL;
+	} else if (strcmp(arg, "comm") == 0) {
+		if (strcmp(t->name, "string") != 0 &&
+		    strcmp(t->name, "string_size") != 0)
+			return -EINVAL;
+		f->fn = t->fetch[FETCH_MTD_comm];
 	} else
 		ret = -EINVAL;
 
@@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
 		arg[t - parg->comm] = '\0';
 		t++;
 	}
+	/*
+	 * The default type of $comm should be "string", and it can't be
+	 * dereferenced.
+	 */
+	if (!t && strcmp(arg, "$comm") == 0)
+		t = "string";
 	parg->type = find_fetch_type(t, ftbl);
 	if (!parg->type) {
 		pr_info("Unsupported type: %s\n", t);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f6398db09114..45400ca5ded1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -102,6 +102,7 @@ enum {
 	FETCH_MTD_reg = 0,
 	FETCH_MTD_stack,
 	FETCH_MTD_retval,
+	FETCH_MTD_comm,
 	FETCH_MTD_memory,
 	FETCH_MTD_symbol,
 	FETCH_MTD_deref,
@@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield);
 #define fetch_bitfield_string			NULL
 #define fetch_bitfield_string_size		NULL
 
+/* comm only makes sense as a string */
+#define fetch_comm_u8		NULL
+#define fetch_comm_u16		NULL
+#define fetch_comm_u32		NULL
+#define fetch_comm_u64		NULL
+DECLARE_FETCH_FUNC(comm, string);
+DECLARE_FETCH_FUNC(comm, string_size);
+
 /*
  * Define macro for basic types - we don't need to define s* types, because
  * we have to care only about bitwidth at recording time.
@@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64)
 ASSIGN_FETCH_FUNC(reg, ftype),				\
 ASSIGN_FETCH_FUNC(stack, ftype),			\
 ASSIGN_FETCH_FUNC(retval, ftype),			\
+ASSIGN_FETCH_FUNC(comm, ftype),				\
 ASSIGN_FETCH_FUNC(memory, ftype),			\
 ASSIGN_FETCH_FUNC(symbol, ftype),			\
 ASSIGN_FETCH_FUNC(deref, ftype),			\
-- 
cgit v1.2.3


From e2ace001176dc9745a472fe8bda1f0b28a4d7351 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 26 May 2016 12:00:33 -0700
Subject: tracing: Choose static tp_printk buffer by explicit nesting count

Currently, the trace_printk code chooses which static buffer to use based
on what type of atomic context (NMI, IRQ, etc) it's in.  Simplify the
code and make it more robust: simply count the nesting depth and choose
a buffer based on the current nesting depth.

The new code will only drop an event if we nest more than 4 deep,
and the old code was guaranteed to malfunction if that happened.

Link: http://lkml.kernel.org/r/07ab03aecfba25fcce8f9a211b14c9c5e2865c58.1464289095.git.luto@kernel.org

Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 83 +++++++++++++++-------------------------------------
 1 file changed, 24 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa240551fc5d..45e6747589c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2339,83 +2339,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 
 /* created for use with alloc_percpu */
 struct trace_buffer_struct {
-	char buffer[TRACE_BUF_SIZE];
+	int nesting;
+	char buffer[4][TRACE_BUF_SIZE];
 };
 
 static struct trace_buffer_struct *trace_percpu_buffer;
-static struct trace_buffer_struct *trace_percpu_sirq_buffer;
-static struct trace_buffer_struct *trace_percpu_irq_buffer;
-static struct trace_buffer_struct *trace_percpu_nmi_buffer;
 
 /*
- * The buffer used is dependent on the context. There is a per cpu
- * buffer for normal context, softirq contex, hard irq context and
- * for NMI context. Thise allows for lockless recording.
- *
- * Note, if the buffers failed to be allocated, then this returns NULL
+ * Thise allows for lockless recording.  If we're nested too deeply, then
+ * this returns NULL.
  */
 static char *get_trace_buf(void)
 {
-	struct trace_buffer_struct *percpu_buffer;
-
-	/*
-	 * If we have allocated per cpu buffers, then we do not
-	 * need to do any locking.
-	 */
-	if (in_nmi())
-		percpu_buffer = trace_percpu_nmi_buffer;
-	else if (in_irq())
-		percpu_buffer = trace_percpu_irq_buffer;
-	else if (in_softirq())
-		percpu_buffer = trace_percpu_sirq_buffer;
-	else
-		percpu_buffer = trace_percpu_buffer;
+	struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
 
-	if (!percpu_buffer)
+	if (!buffer || buffer->nesting >= 4)
 		return NULL;
 
-	return this_cpu_ptr(&percpu_buffer->buffer[0]);
+	return &buffer->buffer[buffer->nesting++][0];
+}
+
+static void put_trace_buf(void)
+{
+	this_cpu_dec(trace_percpu_buffer->nesting);
 }
 
 static int alloc_percpu_trace_buffer(void)
 {
 	struct trace_buffer_struct *buffers;
-	struct trace_buffer_struct *sirq_buffers;
-	struct trace_buffer_struct *irq_buffers;
-	struct trace_buffer_struct *nmi_buffers;
 
 	buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!buffers)
-		goto err_warn;
-
-	sirq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!sirq_buffers)
-		goto err_sirq;
-
-	irq_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!irq_buffers)
-		goto err_irq;
-
-	nmi_buffers = alloc_percpu(struct trace_buffer_struct);
-	if (!nmi_buffers)
-		goto err_nmi;
+	if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
+		return -ENOMEM;
 
 	trace_percpu_buffer = buffers;
-	trace_percpu_sirq_buffer = sirq_buffers;
-	trace_percpu_irq_buffer = irq_buffers;
-	trace_percpu_nmi_buffer = nmi_buffers;
-
 	return 0;
-
- err_nmi:
-	free_percpu(irq_buffers);
- err_irq:
-	free_percpu(sirq_buffers);
- err_sirq:
-	free_percpu(buffers);
- err_warn:
-	WARN(1, "Could not allocate percpu trace_printk buffer");
-	return -ENOMEM;
 }
 
 static int buffers_allocated;
@@ -2506,7 +2464,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);
@@ -2532,6 +2490,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	}
 
 out:
+	put_trace_buf();
+
+out_nobuffer:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 
@@ -2563,7 +2524,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 	tbuffer = get_trace_buf();
 	if (!tbuffer) {
 		len = 0;
-		goto out;
+		goto out_nobuffer;
 	}
 
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
@@ -2582,7 +2543,11 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 		__buffer_unlock_commit(buffer, event);
 		ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
 	}
- out:
+
+out:
+	put_trace_buf();
+
+out_nobuffer:
 	preempt_enable_notrace();
 	unpause_graph_tracing();
 
-- 
cgit v1.2.3


From e947841c0dce9db675a957182214ef8091ac3d61 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 17 Jun 2016 17:40:58 -0400
Subject: tracing: Show the preempt count of when the event was called

Because tracepoint callbacks are done with preemption enabled, the trace
events are always called with preempt disable due to the
rcu_read_lock_sched_notrace() in __DO_TRACE(). This causes the preempt count
shown in the recorded trace event to be inaccurate. It is always one more
that what the preempt_count was when the tracepoint was called.

If CONFIG_PREEMPT is enabled, subtract 1 from the preempt_count before
recording it in the trace buffer.

Link: http://lkml.kernel.org/r/20160525132537.GA10808@linutronix.de

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fd449eb138cf..03c0a48c3ac4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -261,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 
 	local_save_flags(fbuffer->flags);
 	fbuffer->pc = preempt_count();
+	/*
+	 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables
+	 * preemption (adding one to the preempt_count). Since we are
+	 * interested in the preempt_count at the time the tracepoint was
+	 * hit, we need to subtract one to offset the increment.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT))
+		fbuffer->pc--;
 	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
-- 
cgit v1.2.3


From 9a51933e360897d9b3867c9b09dd5ccf7493e97e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 11 May 2016 14:06:57 -0500
Subject: tracing: Expose CPU physical addresses (resource values) for PCI
 devices

Previously, mmio_print_pcidev() put "user" addresses in the trace buffer.
On most architectures, these are the same as CPU physical addresses, but on
microblaze, mips, powerpc, and sparc, they may be something else, typically
a raw BAR value (a bus address as opposed to a CPU address).

Always expose the CPU physical address to avoid this arch-dependent
behavior.

This change should have no user-visible effect because this file currently
depends on CONFIG_HAVE_MMIOTRACE_SUPPORT, which is only defined for x86,
and pci_resource_to_user() is a no-op on x86.

Link: http://lkml.kernel.org/r/20160511190657.5898.4248.stgit@bhelgaas-glaptop2.roam.corp.google.com

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_mmiotrace.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 68f376ca6d3f..cd7480d0a201 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
 	trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
 			 dev->bus->number, dev->devfn,
 			 dev->vendor, dev->device, dev->irq);
-	/*
-	 * XXX: is pci_resource_to_user() appropriate, since we are
-	 * supposed to interpret the __ioremap() phys_addr argument based on
-	 * these printed values?
-	 */
 	for (i = 0; i < 7; i++) {
-		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		start = dev->resource[i].start;
 		trace_seq_printf(s, " %llx",
 			(unsigned long long)(start |
 			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
 	}
 	for (i = 0; i < 7; i++) {
-		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		start = dev->resource[i].start;
+		end = dev->resource[i].end;
 		trace_seq_printf(s, " %llx",
 			dev->resource[i].start < dev->resource[i].end ?
 			(unsigned long long)(end - start) + 1 : 0);
-- 
cgit v1.2.3


From e7e15b87f86d4a48c270b81cf027eafd801e5b89 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Jun 2016 13:06:24 -0400
Subject: cgroup: allow NULL return from ss->css_alloc()

cgroup core expected css_alloc to return an ERR_PTR value on failure
and caused NULL deref if it returned NULL.  It's an easy mistake to
make from an alloc function and there's no ambiguity in what's being
indicated.  Update css_create() so that it interprets NULL return from
css_alloc as -ENOMEM.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 78f6d18ff0af..dd26e1bb7222 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5133,6 +5133,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 	lockdep_assert_held(&cgroup_mutex);
 
 	css = ss->css_alloc(parent_css);
+	if (!css)
+		css = ERR_PTR(-ENOMEM);
 	if (IS_ERR(css))
 		return css;
 
-- 
cgit v1.2.3


From 135b8b37bd91cc82f83e98fca109b80375f5317e Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Tue, 21 Jun 2016 14:04:36 -0400
Subject: cgroup: Add pids controller event when fork fails because of pid
 limit

This patch adds more visibility into the pids controller when the controller
rejects a fork request. Whenever fork fails because the limit on the number of
pids in the cgroup is reached, the controller will log this and also notify the
newly added cgroups events file. The `max` key in the events file represents
the number of times fork failed because of the pids controller.

This change also logs only the first time the `max` event counter is
incremented. This is to provide a hint to the user to understand why fork
failed, as users are not yet used to seeing fork failures because of the
pids controller.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Acked-by: Johannes Weiner <hannes <at> cmpxchg.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup_pids.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 303097b37429..9740ea6762de 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -49,6 +49,12 @@ struct pids_cgroup {
 	 */
 	atomic64_t			counter;
 	int64_t				limit;
+
+	/* Handle for "pids.events" */
+	struct cgroup_file		events_file;
+
+	/* Number of times fork failed because limit was hit. */
+	atomic64_t			events_limit;
 };
 
 static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
 
 	pids->limit = PIDS_MAX;
 	atomic64_set(&pids->counter, 0);
+	atomic64_set(&pids->events_limit, 0);
 	return &pids->css;
 }
 
@@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
+	int err;
 
 	css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
-	return pids_try_charge(pids, 1);
+	err = pids_try_charge(pids, 1);
+	if (err) {
+		/* Only log the first time events_limit is incremented. */
+		if (atomic64_inc_return(&pids->events_limit) == 1) {
+			pr_info("cgroup: fork rejected by pids controller in ");
+			pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+			pr_cont("\n");
+		}
+		cgroup_file_notify(&pids->events_file);
+	}
+	return err;
 }
 
 static void pids_cancel_fork(struct task_struct *task)
@@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
 	return atomic64_read(&pids->counter);
 }
 
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+	struct pids_cgroup *pids = css_pids(seq_css(sf));
+
+	seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
+	return 0;
+}
+
 static struct cftype pids_files[] = {
 	{
 		.name = "max",
@@ -300,6 +326,12 @@ static struct cftype pids_files[] = {
 		.read_s64 = pids_current_read,
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "events",
+		.seq_show = pids_events_show,
+		.file_offset = offsetof(struct pids_cgroup, events_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{ }	/* terminate */
 };
 
-- 
cgit v1.2.3


From 9f6870dd9790dd87da1d0cf9e43e60113f3a278d Mon Sep 17 00:00:00 2001
From: Kenny Yu <kennyyu@fb.com>
Date: Tue, 21 Jun 2016 11:55:35 -0700
Subject: cgroup: Use lld instead of ld when printing pids controller
 events_limit

The `events_limit` variable needs to be formatted with %lld and not %ld.
This fixes the following warning discovered by kbuild test robot:

   kernel/cgroup_pids.c: In function 'pids_events_show':
   kernel/cgroup_pids.c:313:24: warning: format '%ld' expects argument of type
   'long int', but argument 3 has type 'long long int' [-Wformat=]
        seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
                                   ^

tj: Added explicit (s64) cast as atomic64 switches between long long
    and long depending on 32 or 64.

Signed-off-by: Kenny Yu <kennyyu@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup_pids.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 9740ea6762de..2bd673783f1a 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -310,7 +310,7 @@ static int pids_events_show(struct seq_file *sf, void *v)
 {
 	struct pids_cgroup *pids = css_pids(seq_css(sf));
 
-	seq_printf(sf, "max %ld\n", atomic64_read(&pids->events_limit));
+	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
 	return 0;
 }
 
-- 
cgit v1.2.3


From be54f69c26193de31053190761e521903b89d098 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 Jun 2016 14:03:47 -0400
Subject: tracing: Skip more functions when doing stack tracing of events

 # echo 1 > options/stacktrace
 # echo 1 > events/sched/sched_switch/enable
 # cat trace
          <idle>-0     [002] d..2  1982.525169: <stack trace>
 => save_stack_trace
 => __ftrace_trace_stack
 => trace_buffer_unlock_commit_regs
 => event_trigger_unlock_commit
 => trace_event_buffer_commit
 => trace_event_raw_event_sched_switch
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

The above shows that we are seeing 6 functions before ever making it to the
caller of the sched_switch event.

 # echo stacktrace > events/sched/sched_switch/trigger
 # cat trace
          <idle>-0     [002] d..3  2146.335208: <stack trace>
 => trace_event_buffer_commit
 => trace_event_raw_event_sched_switch
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

The stacktrace trigger isn't as bad, because it adds its own skip to the
stacktracing, but still has two events extra.

One issue is that if the stacktrace passes its own "regs" then there should
be no addition to the skip, as the regs will not include the functions being
called. This was an issue that was fixed by commit 7717c6be6999 ("tracing:
Fix stacktrace skip depth in trace_buffer_unlock_commit_regs()" as adding
the skip number for kprobes made the probes not have any stack at all.

But since this is only an issue when regs is being used, a skip should be
added if regs is NULL. Now we have:

 # echo 1 > options/stacktrace
 # echo 1 > events/sched/sched_switch/enable
 # cat trace
          <idle>-0     [000] d..2  1297.676333: <stack trace>
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => rest_init
 => start_kernel
 => x86_64_start_reservations
 => x86_64_start_kernel

 # echo stacktrace > events/sched/sched_switch/trigger
 # cat trace
          <idle>-0     [002] d..3  1370.759745: <stack trace>
 => __schedule
 => schedule
 => schedule_preempt_disabled
 => cpu_startup_entry
 => start_secondary

And kprobes are not touched.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45e6747589c6..3d9f31b576f3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2118,7 +2118,17 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 {
 	__buffer_unlock_commit(buffer, event);
 
-	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
+	/*
+	 * If regs is not set, then skip the following callers:
+	 *   trace_buffer_unlock_commit_regs
+	 *   event_trigger_unlock_commit
+	 *   trace_event_buffer_commit
+	 *   trace_event_raw_event_sched_switch
+	 * Note, we can still get here via blktrace, wakeup tracer
+	 * and mmiotrace, but that's ok if they lose a function or
+	 * two. They are that meaningful.
+	 */
+	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
@@ -2168,6 +2178,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	trace.nr_entries	= 0;
 	trace.skip		= skip;
 
+	/*
+	 * Add two, for this function and the call to save_stack_trace()
+	 * If regs is set, then these functions will not be in the way.
+	 */
+	if (!regs)
+		trace.skip += 2;
+
 	/*
 	 * Since events can happen in NMIs there's no safe way to
 	 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
-- 
cgit v1.2.3


From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Wed, 23 Sep 2015 15:16:04 -0500
Subject: fs: Limit file caps to the user namespace of the super block

Capability sets attached to files must be ignored except in the
user namespaces where the mounter is privileged, i.e. s_user_ns
and its descendants. Otherwise a vector exists for gaining
privileges in namespaces where a user is not already privileged.

Add a new helper function, current_in_user_ns(), to test whether a user
namespace is the same as or a descendant of another namespace.
Use this helper to determine whether a file's capability set
should be applied to the caps constructed during exec.

--EWB Replaced in_userns with the simpler current_in_userns.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc211930c..68f594212759 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
 	return allowed;
 }
 
+/*
+ * Returns true if @ns is the same namespace as or a descendant of
+ * @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+	struct user_namespace *ns;
+	for (ns = current_user_ns(); ns; ns = ns->parent) {
+		if (ns == target_ns)
+			return true;
+	}
+	return false;
+}
+
 static inline struct user_namespace *to_user_ns(struct ns_common *ns)
 {
 	return container_of(ns, struct user_namespace, ns);
-- 
cgit v1.2.3


From f295e53b60eb93ee53ed5ac610374ed293caa57b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 17 Jun 2016 11:08:06 -0700
Subject: libnvdimm, pmem: allow nfit_test to override pmem_direct_access()

Currently phys_to_pfn_t() is an exported symbol to allow nfit_test to
override it and indicate that nfit_test-pmem is not device-mapped.  Now,
we want to enable nfit_test to operate without DMA_CMA and the pmem it
provides will no longer be physically contiguous, i.e. won't be capable
of supporting direct_access requests larger than a page.  Make
pmem_direct_access() a weak symbol so that it can be replaced by the
tools/testing/nvdimm/ version, and move phys_to_pfn_t() to a static
inline now that it no longer needs to be overridden.

Acked-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/memremap.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..852e5266124a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -169,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr)
 }
 EXPORT_SYMBOL(devm_memunmap);
 
-pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
-	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-EXPORT_SYMBOL(phys_to_pfn_t);
-
 #ifdef CONFIG_ZONE_DEVICE
 static DEFINE_MUTEX(pgmap_lock);
 static RADIX_TREE(pgmap_radix, GFP_KERNEL);
-- 
cgit v1.2.3


From 86b2efbe3a390e07dbba725ef700b0d143e9a385 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 24 Jun 2016 16:35:46 -0400
Subject: audit: add fields to exclude filter by reusing user filter

RFE: add additional fields for use in audit filter exclude rules
https://github.com/linux-audit/audit-kernel/issues/5

Re-factor and combine audit_filter_type() with audit_filter_user() to
use audit_filter_user_rules() to enable the exclude filter to
additionally filter on PID, UID, GID, AUID, LOGINUID_SET, SUBJ_*.

The process of combining the similar audit_filter_user() and
audit_filter_type() functions, required inverting the meaning and
including the ALWAYS action of the latter.

Include audit_filter_user_rules() into audit_filter(), removing
unneeded logic in the process.

Keep the check to quit early if the list is empty.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: checkpatch.pl fixes - whitespace damage, wrapped description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c       |   4 +-
 kernel/audit.h       |   2 +
 kernel/auditfilter.c | 151 ++++++++++++++++++---------------------------------
 3 files changed, 57 insertions(+), 100 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 678c3f000191..994588ef9489 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -934,7 +934,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
-		err = audit_filter_user(msg_type);
+		err = audit_filter(msg_type, AUDIT_FILTER_USER);
 		if (err == 1) { /* match or error */
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
@@ -1382,7 +1382,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (audit_initialized != AUDIT_INITIALIZED)
 		return NULL;
 
-	if (unlikely(audit_filter_type(type)))
+	if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
 		return NULL;
 
 	if (gfp_mask & __GFP_DIRECT_RECLAIM) {
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..1879f02cb2c3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -327,6 +327,8 @@ extern pid_t audit_sig_pid;
 extern kuid_t audit_sig_uid;
 extern u32 audit_sig_sid;
 
+extern int audit_filter(int msgtype, unsigned int listtype);
+
 #ifdef CONFIG_AUDITSYSCALL
 extern int __audit_signal_info(int sig, struct task_struct *t);
 static inline int audit_signal_info(int sig, struct task_struct *t)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ff59a5eed691..85d9cac497e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1290,117 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
 	return strncmp(p, dname, dlen);
 }
 
-static int audit_filter_user_rules(struct audit_krule *rule, int type,
-				   enum audit_state *state)
-{
-	int i;
-
-	for (i = 0; i < rule->field_count; i++) {
-		struct audit_field *f = &rule->fields[i];
-		pid_t pid;
-		int result = 0;
-		u32 sid;
-
-		switch (f->type) {
-		case AUDIT_PID:
-			pid = task_pid_nr(current);
-			result = audit_comparator(pid, f->op, f->val);
-			break;
-		case AUDIT_UID:
-			result = audit_uid_comparator(current_uid(), f->op, f->uid);
-			break;
-		case AUDIT_GID:
-			result = audit_gid_comparator(current_gid(), f->op, f->gid);
-			break;
-		case AUDIT_LOGINUID:
-			result = audit_uid_comparator(audit_get_loginuid(current),
-						  f->op, f->uid);
-			break;
-		case AUDIT_LOGINUID_SET:
-			result = audit_comparator(audit_loginuid_set(current),
-						  f->op, f->val);
-			break;
-		case AUDIT_MSGTYPE:
-			result = audit_comparator(type, f->op, f->val);
-			break;
-		case AUDIT_SUBJ_USER:
-		case AUDIT_SUBJ_ROLE:
-		case AUDIT_SUBJ_TYPE:
-		case AUDIT_SUBJ_SEN:
-		case AUDIT_SUBJ_CLR:
-			if (f->lsm_rule) {
-				security_task_getsecid(current, &sid);
-				result = security_audit_rule_match(sid,
-								   f->type,
-								   f->op,
-								   f->lsm_rule,
-								   NULL);
-			}
-			break;
-		}
-
-		if (result <= 0)
-			return result;
-	}
-	switch (rule->action) {
-	case AUDIT_NEVER:
-		*state = AUDIT_DISABLED;
-		break;
-	case AUDIT_ALWAYS:
-		*state = AUDIT_RECORD_CONTEXT;
-		break;
-	}
-	return 1;
-}
-
-int audit_filter_user(int type)
-{
-	enum audit_state state = AUDIT_DISABLED;
-	struct audit_entry *e;
-	int rc, ret;
-
-	ret = 1; /* Audit by default */
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
-		rc = audit_filter_user_rules(&e->rule, type, &state);
-		if (rc) {
-			if (rc > 0 && state == AUDIT_DISABLED)
-				ret = 0;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return ret;
-}
-
-int audit_filter_type(int type)
+int audit_filter(int msgtype, unsigned int listtype)
 {
 	struct audit_entry *e;
-	int result = 0;
+	int ret = 1; /* Audit by default */
 
 	rcu_read_lock();
-	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+	if (list_empty(&audit_filter_list[listtype]))
 		goto unlock_and_return;
+	list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
+		int i, result = 0;
 
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
-				list) {
-		int i;
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
-			if (f->type == AUDIT_MSGTYPE) {
-				result = audit_comparator(type, f->op, f->val);
-				if (!result)
-					break;
+			pid_t pid;
+			u32 sid;
+
+			switch (f->type) {
+			case AUDIT_PID:
+				pid = task_pid_nr(current);
+				result = audit_comparator(pid, f->op, f->val);
+				break;
+			case AUDIT_UID:
+				result = audit_uid_comparator(current_uid(), f->op, f->uid);
+				break;
+			case AUDIT_GID:
+				result = audit_gid_comparator(current_gid(), f->op, f->gid);
+				break;
+			case AUDIT_LOGINUID:
+				result = audit_uid_comparator(audit_get_loginuid(current),
+							      f->op, f->uid);
+				break;
+			case AUDIT_LOGINUID_SET:
+				result = audit_comparator(audit_loginuid_set(current),
+							  f->op, f->val);
+				break;
+			case AUDIT_MSGTYPE:
+				result = audit_comparator(msgtype, f->op, f->val);
+				break;
+			case AUDIT_SUBJ_USER:
+			case AUDIT_SUBJ_ROLE:
+			case AUDIT_SUBJ_TYPE:
+			case AUDIT_SUBJ_SEN:
+			case AUDIT_SUBJ_CLR:
+				if (f->lsm_rule) {
+					security_task_getsecid(current, &sid);
+					result = security_audit_rule_match(sid,
+							f->type, f->op, f->lsm_rule, NULL);
+				}
+				break;
+			default:
+				goto unlock_and_return;
 			}
+			if (result < 0) /* error */
+				goto unlock_and_return;
+			if (!result)
+				break;
+		}
+		if (result > 0) {
+			if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE)
+				ret = 0;
+			break;
 		}
-		if (result)
-			goto unlock_and_return;
 	}
 unlock_and_return:
 	rcu_read_unlock();
-	return result;
+	return ret;
 }
 
 static int update_lsm_rule(struct audit_krule *r)
-- 
cgit v1.2.3


From 7fa8b7171a638ad896acabd9a17183b75b70aeb4 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <agnel.joel@gmail.com>
Date: Fri, 17 Jun 2016 22:44:54 -0700
Subject: tracing/function_graph: Fix filters for function_graph threshold

Function graph tracer currently ignores filters if tracing_thresh is set.
For example, even if set_ftrace_pid is set, then its ignored if tracing_thresh
set, resulting in all processes being traced.

To fix this, we reuse the same entry function as when tracing_thresh is not
set and do everything as in the regular case except for writing the function entry
to the ring buffer.

Link: http://lkml.kernel.org/r/1466228694-2677-1-git-send-email-agnel.joel@gmail.com

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Joel Fernandes <agnel.joel@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 67cce7896aeb..7363ccf79512 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	if (ftrace_graph_notrace_addr(trace->func))
 		return 1;
 
+	/*
+	 * Stop here if tracing_threshold is set. We only write function return
+	 * events to the ring buffer.
+	 */
+	if (tracing_thresh)
+		return 1;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	return ret;
 }
 
-static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
-{
-	if (tracing_thresh)
-		return 1;
-	else
-		return trace_graph_entry(trace);
-}
-
 static void
 __trace_graph_function(struct trace_array *tr,
 		unsigned long ip, unsigned long flags, int pc)
@@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr)
 	set_graph_array(tr);
 	if (tracing_thresh)
 		ret = register_ftrace_graph(&trace_graph_thresh_return,
-					    &trace_graph_thresh_entry);
+					    &trace_graph_entry);
 	else
 		ret = register_ftrace_graph(&trace_graph_return,
 					    &trace_graph_entry);
-- 
cgit v1.2.3


From ea00f4f4f00cc2bc3b63ad512a4e6df3b20832b9 Mon Sep 17 00:00:00 2001
From: Lianwei Wang <lianwei.wang@gmail.com>
Date: Sun, 19 Jun 2016 23:52:27 -0700
Subject: PM / sleep: make PM notifiers called symmetrically

This makes pm notifier PREPARE/POST symmetrical: if PREPARE
fails, we will only undo what ever happened on PREPARE.

It fixes the unbalanced CPU hotplug enable in CPU PM notifier.

Signed-off-by: Lianwei Wang <lianwei.wang@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 20 ++++++++++++--------
 kernel/power/main.c      | 11 +++++++++--
 kernel/power/power.h     |  2 ++
 kernel/power/suspend.c   | 10 ++++++----
 kernel/power/user.c      | 14 ++++++++------
 5 files changed, 37 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fca9254280ee..126e24caa82e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -647,7 +647,7 @@ static void power_down(void)
  */
 int hibernate(void)
 {
-	int error;
+	int error, nr_calls = 0;
 
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +662,11 @@ int hibernate(void)
 	}
 
 	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Exit;
+	}
 
 	printk(KERN_INFO "PM: Syncing filesystems ... ");
 	sys_sync();
@@ -714,7 +716,7 @@ int hibernate(void)
 	/* Don't bother checking whether freezer_test_done is true */
 	freezer_test_done = false;
  Exit:
-	pm_notifier_call_chain(PM_POST_HIBERNATION);
+	__pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
  Unlock:
@@ -740,7 +742,7 @@ int hibernate(void)
  */
 static int software_resume(void)
 {
-	int error;
+	int error, nr_calls = 0;
 	unsigned int flags;
 
 	/*
@@ -827,9 +829,11 @@ static int software_resume(void)
 	}
 
 	pm_prepare_console();
-	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Close_Finish;
+	}
 
 	pr_debug("PM: Preparing processes for restore.\n");
 	error = freeze_processes();
@@ -855,7 +859,7 @@ static int software_resume(void)
 	unlock_device_hotplug();
 	thaw_processes();
  Finish:
-	pm_notifier_call_chain(PM_POST_RESTORE);
+	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
 	pm_restore_console();
 	atomic_inc(&snapshot_device_available);
 	/* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 27946975eff0..5ea50b1b7595 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_pm_notifier);
 
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
 {
-	int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+	int ret;
+
+	ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+						nr_to_call, nr_calls);
 
 	return notifier_to_errno(ret);
 }
+int pm_notifier_call_chain(unsigned long val)
+{
+	return __pm_notifier_call_chain(val, -1, NULL);
+}
 
 /* If set, devices may be suspended and resumed asynchronously. */
 int pm_async_enabled = 1;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index efe1b3b17c88..51f02ecaf125 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -200,6 +200,8 @@ static inline void suspend_test_finish(const char *label) {}
 
 #ifdef CONFIG_PM_SLEEP
 /* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+				    int *nr_calls);
 extern int pm_notifier_call_chain(unsigned long val);
 #endif
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 5b70d64b871e..0acab9d7f96f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -266,16 +266,18 @@ static int suspend_test(int level)
  */
 static int suspend_prepare(suspend_state_t state)
 {
-	int error;
+	int error, nr_calls = 0;
 
 	if (!sleep_state_supported(state))
 		return -EPERM;
 
 	pm_prepare_console();
 
-	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
-	if (error)
+	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+	if (error) {
+		nr_calls--;
 		goto Finish;
+	}
 
 	trace_suspend_resume(TPS("freeze_processes"), 0, true);
 	error = suspend_freeze_processes();
@@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state)
 	suspend_stats.failed_freeze++;
 	dpm_save_failed_step(SUSPEND_FREEZE);
  Finish:
-	pm_notifier_call_chain(PM_POST_SUSPEND);
+	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
 	pm_restore_console();
 	return error;
 }
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..35310b627388 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
 static int snapshot_open(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
-	int error;
+	int error, nr_calls = 0;
 
 	if (!hibernation_available())
 		return -EPERM;
@@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
 		data->free_bitmaps = false;
-		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+		error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
 		if (error)
-			pm_notifier_call_chain(PM_POST_HIBERNATION);
+			__pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
 	} else {
 		/*
 		 * Resuming.  We may need to wait for the image device to
@@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 
 		data->swap = -1;
 		data->mode = O_WRONLY;
-		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+		error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
 		if (!error) {
 			error = create_basic_memory_bitmaps();
 			data->free_bitmaps = !error;
-		}
+		} else
+			nr_calls--;
+
 		if (error)
-			pm_notifier_call_chain(PM_POST_RESTORE);
+			__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
 	}
 	if (error)
 		atomic_inc(&snapshot_device_available);
-- 
cgit v1.2.3


From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:23 +0200
Subject: bpf: minor cleanups on fd maps and helpers

Some minor cleanups: i) Remove the unlikely() from fd array map lookups
and let the CPU branch predictor do its job, scenarios where there is not
always a map entry are very well valid. ii) Move the attribute type check
in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt
checks with bpf_perf_event_output() helper as well. iii) remove some
comments that are self-documenting in kprobe_prog_is_valid_access() and
therefore make it consistent to tp_prog_is_valid_access() as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c        |  3 +--
 kernel/trace/bpf_trace.c | 18 ++++++------------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b94a36550591..d638062f66d6 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -719,14 +719,13 @@ select_insn:
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-
 		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
 			goto out;
 
 		tail_call_cnt++;
 
 		prog = READ_ONCE(array->ptrs[index]);
-		if (unlikely(!prog))
+		if (!prog)
 			goto out;
 
 		/* ARG1 at this point is guaranteed to point to CTX from
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3de25fbed785..4e61f74a5d73 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
+	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+		     event->attr.type != PERF_TYPE_RAW))
+		return -EINVAL;
+
 	/* make sure event is local and doesn't have pmu::count */
 	if (event->oncpu != smp_processor_id() ||
 	    event->pmu->count)
 		return -EINVAL;
 
-	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
-		     event->attr.type != PERF_TYPE_RAW))
-		return -EINVAL;
-
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
@@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		return -E2BIG;
 
 	ee = READ_ONCE(array->ptrs[index]);
-	if (unlikely(!ee))
+	if (!ee)
 		return -ENOENT;
 
 	event = ee->event;
@@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 					enum bpf_reg_type *reg_type)
 {
-	/* check bounds */
 	if (off < 0 || off >= sizeof(struct pt_regs))
 		return false;
-
-	/* only read is allowed */
 	if (type != BPF_READ)
 		return false;
-
-	/* disallow misaligned access */
 	if (off % size != 0)
 		return false;
-
 	return true;
 }
 
-- 
cgit v1.2.3


From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:24 +0200
Subject: bpf, trace: fetch current cpu only once

We currently have two invocations, which is unnecessary. Fetch it only
once and use the smp_processor_id() variant, so we also get preemption
checks along with it when DEBUG_PREEMPT is set.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4e61f74a5d73..505f9e9cdb3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
@@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 	if (index == BPF_F_CURRENT_CPU)
-		index = raw_smp_processor_id();
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
@@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
 		return -EINVAL;
 
-	if (unlikely(event->oncpu != smp_processor_id()))
+	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
 	perf_sample_data_init(&sample_data, 0, 0);
-- 
cgit v1.2.3


From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:25 +0200
Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read

Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU
flag for bpf_perf_event_output") to add the same functionality into
bpf_perf_event_read() helper. The split of index into flags and index
component is also safe here, since such large maps are rejected during
map allocation time.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 505f9e9cdb3b..19c5b4a5c3eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
 {
 	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	unsigned int cpu = smp_processor_id();
+	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (index == BPF_F_CURRENT_CPU)
+		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
@@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 		return -EINVAL;
 
 	/* make sure event is local and doesn't have pmu::count */
-	if (event->oncpu != smp_processor_id() ||
-	    event->pmu->count)
+	if (unlikely(event->oncpu != cpu || event->pmu->count))
 		return -EINVAL;
 
 	/*
-- 
cgit v1.2.3


From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 28 Jun 2016 12:18:26 +0200
Subject: bpf: don't use raw processor id in generic helper

Use smp_processor_id() for the generic helper bpf_get_smp_processor_id()
instead of the raw variant. This allows for preemption checks when we
have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only
need to keep the raw variant for socket filters, but we can reuse the
helper that is already there from cBPF side.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ad7a0573f71b..1ea3afba1a4f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 
 static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	return raw_smp_processor_id();
+	return smp_processor_id();
 }
 
 const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
-- 
cgit v1.2.3


From 5f65e5ca286126a60f62c8421b77c2018a482b8a Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:24 -0500
Subject: cred: Reject inodes with invalid ids in set_create_file_as()

Using INVALID_[UG]ID for the LSM file creation context doesn't
make sense, so return an error if the inode passed to
set_create_file_as() has an invalid id.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/cred.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a62285..5f264fb5737d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
  */
 int set_create_files_as(struct cred *new, struct inode *inode)
 {
+	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+		return -EINVAL;
 	new->fsuid = inode->i_uid;
 	new->fsgid = inode->i_gid;
 	return security_kernel_create_files_as(new, inode);
-- 
cgit v1.2.3


From 1aacde3d22c42281236155c1ef6d7a5aa32a826b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jun 2016 17:24:43 +0200
Subject: bpf: generally move prog destruction to RCU deferral

Jann Horn reported following analysis that could potentially result
in a very hard to trigger (if not impossible) UAF race, to quote his
event timeline:

 - Set up a process with threads T1, T2 and T3
 - Let T1 set up a socket filter F1 that invokes another filter F2
   through a BPF map [tail call]
 - Let T1 trigger the socket filter via a unix domain socket write,
   don't wait for completion
 - Let T2 call PERF_EVENT_IOC_SET_BPF with F2, don't wait for completion
 - Now T2 should be behind bpf_prog_get(), but before bpf_prog_put()
 - Let T3 close the file descriptor for F2, dropping the reference
   count of F2 to 2
 - At this point, T1 should have looked up F2 from the map, but not
   finished executing it
 - Let T3 remove F2 from the BPF map, dropping the reference count of
   F2 to 1
 - Now T2 should call bpf_prog_put() (wrong BPF program type), dropping
   the reference count of F2 to 0 and scheduling bpf_prog_free_deferred()
   via schedule_work()
 - At this point, the BPF program could be freed
 - BPF execution is still running in a freed BPF program

While at PERF_EVENT_IOC_SET_BPF time it's only guaranteed that the perf
event fd we're doing the syscall on doesn't disappear from underneath us
for whole syscall time, it may not be the case for the bpf fd used as
an argument only after we did the put. It needs to be a valid fd pointing
to a BPF program at the time of the call to make the bpf_prog_get() and
while T2 gets preempted, F2 must have dropped reference to 1 on the other
CPU. The fput() from the close() in T3 should also add additionally delay
to the reference drop via exit_task_work() when bpf_prog_release() gets
called as well as scheduling bpf_prog_free_deferred().

That said, it makes nevertheless sense to move the BPF prog destruction
generally after RCU grace period to guarantee that such scenario above,
but also others as recently fixed in ceb56070359b ("bpf, perf: delay release
of BPF prog after grace period") with regards to tail calls won't happen.
Integrating bpf_prog_free_deferred() directly into the RCU callback is
not allowed since the invocation might happen from either softirq or
process context, so we're not permitted to block. Reviewing all bpf_prog_put()
invocations from eBPF side (note, cBPF -> eBPF progs don't use this for
their destruction) with call_rcu() look good to me.

Since we don't know whether at the time of attaching the program, we're
already part of a tail call map, we need to use RCU variant. However, due
to this, there won't be severely more stress on the RCU callback queue:
situations with above bpf_prog_get() and bpf_prog_put() combo in practice
normally won't lead to releases, but even if they would, enough effort/
cycles have to be put into loading a BPF program into the kernel already.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c |  4 +---
 kernel/bpf/syscall.c  | 13 +++----------
 kernel/events/core.c  |  2 +-
 3 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5af30732697b..4ec57a649b1f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -390,9 +390,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 
 static void prog_fd_array_put_ptr(void *ptr)
 {
-	struct bpf_prog *prog = ptr;
-
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(ptr);
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c23a4e9311b3..f6806a1d7ed9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -623,7 +623,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
 	free_uid(user);
 }
 
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
 
@@ -632,17 +632,10 @@ static void __prog_put_common(struct rcu_head *rcu)
 	bpf_prog_free(aux->prog);
 }
 
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
-	if (atomic_dec_and_test(&prog->aux->refcnt))
-		call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
 void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt))
-		__prog_put_common(&prog->aux->rcu);
+		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -650,7 +643,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_prog *prog = filp->private_data;
 
-	bpf_prog_put_rcu(prog);
+	bpf_prog_put(prog);
 	return 0;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 85cd41878a74..9c51ec3f0f44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7529,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 	prog = event->tp_event->prog;
 	if (prog) {
 		event->tp_event->prog = NULL;
-		bpf_prog_put_rcu(prog);
+		bpf_prog_put(prog);
 	}
 }
 
-- 
cgit v1.2.3


From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jun 2016 17:24:44 +0200
Subject: bpf: refactor bpf_prog_get and type check into helper

Since bpf_prog_get() and program type check is used in a couple of places,
refactor this into a small helper function that we can make use of. Since
the non RO prog->aux part is not used in performance critical paths and a
program destruction via RCU is rather very unlikley when doing the put, we
shouldn't have an issue just doing the bpf_prog_get() + prog->type != type
check, but actually not taking the ref at all (due to being in fdget() /
fdput() section of the bpf fd) is even cleaner and makes the diff smaller
as well, so just go for that. Callsites are changed to make use of the new
helper where possible.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f6806a1d7ed9..22863d9872b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -657,7 +657,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
 				O_RDWR | O_CLOEXEC);
 }
 
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
 {
 	if (!f.file)
 		return ERR_PTR(-EBADF);
@@ -678,24 +678,35 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 	return prog;
 }
 
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
 
-	prog = __bpf_prog_get(f);
+	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
+	if (type && prog->type != *type) {
+		prog = ERR_PTR(-EINVAL);
+		goto out;
+	}
 
 	prog = bpf_prog_inc(prog);
+out:
 	fdput(f);
-
 	return prog;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+	return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD kern_version
-- 
cgit v1.2.3


From 1f3fe7ebf6136c341012db9f554d4caa566fcbaa Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:42 -0700
Subject: cgroup: Add cgroup_get_from_fd

Add a helper function to get a cgroup2 from a fd.  It will be
stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will
be introduced in the later patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..50787cd61da2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,7 @@
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
 #include <linux/proc_ns.h>
+#include <linux/file.h>
 #include <net/sock.h>
 
 /*
@@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
+/**
+ * cgroup_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup2_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory.  Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *cgrp;
+	struct file *f;
+
+	f = fget_raw(fd);
+	if (!f)
+		return ERR_PTR(-EBADF);
+
+	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	fput(f);
+	if (IS_ERR(css))
+		return ERR_CAST(css);
+
+	cgrp = css->cgroup;
+	if (!cgroup_on_dfl(cgrp)) {
+		cgroup_put(cgrp);
+		return ERR_PTR(-EBADF);
+	}
+
+	return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
-- 
cgit v1.2.3


From 4ed8ec521ed57c4e207ad464ca0388776de74d4b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:43 -0700
Subject: cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations.
To update an element, the caller is expected to obtain a cgroup2 backed
fd by open(cgroup2_dir) and then update the array with that fd.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c  |  3 ++-
 kernel/bpf/verifier.c |  2 ++
 3 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 4ec57a649b1f..db1a743e3db2 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void)
 	return 0;
 }
 late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+				     struct file *map_file /* not used */,
+				     int fd)
+{
+	return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+	/* cgroup_put free cgrp after a rcu grace period */
+	cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = cgroup_fd_array_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
+	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+	.ops = &cgroup_array_ops,
+	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+	bpf_register_map_type(&cgroup_array_type);
+	return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 22863d9872b1..96d938a22050 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr)
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_update(map, key, value, attr->flags);
 	} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
-		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+		   map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+		   map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
 		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
 						   attr->flags);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eec9f90ba030..69ba2251a22b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1035,6 +1035,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (func_id != BPF_FUNC_get_stackid)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_CGROUP_ARRAY:
+		goto error;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 30 Jun 2016 10:28:44 -0700
Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto

Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk
belongs to a descendant of a cgroup2.  It is similar to the
feature added in netfilter:
commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")

The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY
which will be used by the bpf_skb_in_cgroup.

Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY
and bpf_skb_in_cgroup() are always used together.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 69ba2251a22b..e206c2181412 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1036,7 +1036,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		goto error;
+		if (func_id != BPF_FUNC_skb_in_cgroup)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -1056,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
+	case BPF_FUNC_skb_in_cgroup:
+		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+			goto error;
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 7b776af66dc462caa7e839cc5c950a61db1f8551 Mon Sep 17 00:00:00 2001
From: Roger Lu <roger.lu@mediatek.com>
Date: Fri, 1 Jul 2016 11:05:02 +0800
Subject: PM / suspend: show workqueue state in suspend flow

If freezable workqueue aborts suspend flow, show
workqueue state for debug purpose.

Signed-off-by: Roger Lu <roger.lu@mediatek.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/process.c | 3 +++
 kernel/workqueue.c     | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..6eef250a5705 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only)
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
+		if (wq_busy)
+			show_workqueue_state();
+
 		if (!wakeup) {
 			read_lock(&tasklist_lock);
 			for_each_process_thread(g, p) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..619e80ce4a59 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4369,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq)
 /**
  * show_workqueue_state - dump workqueue state
  *
- * Called from a sysrq handler and prints out all busy workqueues and
- * pools.
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
  */
 void show_workqueue_state(void)
 {
-- 
cgit v1.2.3


From 9c744481c003697de453e8fc039468143ba604aa Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:00:51 +0200
Subject: PM / hibernate: Do not free preallocated safe pages during image
 restore

The core image restoration code preallocates some safe pages
(ie. pages that weren't used by the image kernel before hibernation)
for future use before allocating the bulk of memory for loading the
image data.  Those safe pages are then freed so they can be allocated
again (with the memory management subsystem's help).  That's done to
ensure that there will be enough safe pages for temporary data
structures needed during image restoration.

However, it is not really necessary to free those pages after they
have been allocated.  They can be added to the (global) list of
safe pages right away and then picked up from there when needed
without freeing.

That reduces the overhead related to using safe pages, especially
in the arch-specific code, so modify the code accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 66 ++++++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..d9476ff877b8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -74,6 +74,22 @@ void __init hibernate_image_size_init(void)
  */
 struct pbe *restore_pblist;
 
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+	struct linked_page *next;
+	char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
+ */
+static struct linked_page *safe_pages_list;
+
 /* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
@@ -113,9 +129,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 	return res;
 }
 
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+	if (safe_pages_list) {
+		void *ret = safe_pages_list;
+
+		safe_pages_list = safe_pages_list->next;
+		memset(ret, 0, PAGE_SIZE);
+		return ret;
+	}
+	return get_image_page(gfp_mask, PG_SAFE);
+}
+
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+	return (unsigned long)__get_safe_page(gfp_mask);
 }
 
 static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -150,15 +178,6 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	__free_page(page);
 }
 
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
-	struct linked_page *next;
-	char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
 static inline void
 free_list_of_pages(struct linked_page *list, int clear_page_nosave)
 {
@@ -208,7 +227,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
 		struct linked_page *lp;
 
-		lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+		lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+					get_image_page(ca->gfp_mask, PG_ANY);
 		if (!lp)
 			return NULL;
 
@@ -2104,11 +2124,6 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 	return 0;
 }
 
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
 #ifdef CONFIG_HIGHMEM
 /* struct highmem_pbe is used for creating the list of highmem pages that
  * should be restored atomically during the resume from disk, because the page
@@ -2334,7 +2349,7 @@ static int
 prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
-	struct linked_page *sp_list, *lp;
+	struct linked_page *lp;
 	int error;
 
 	/* If there is no highmem, the buffer will not be necessary */
@@ -2362,9 +2377,9 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	 * NOTE: This way we make sure there will be enough safe pages for the
 	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
 	 * nr_copy_pages cannot be greater than 50% of the memory anyway.
+	 *
+	 * nr_copy_pages cannot be less than allocated_unsafe_pages too.
 	 */
-	sp_list = NULL;
-	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
 	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
 	while (nr_pages > 0) {
@@ -2373,12 +2388,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 			error = -ENOMEM;
 			goto Free;
 		}
-		lp->next = sp_list;
-		sp_list = lp;
+		lp->next = safe_pages_list;
+		safe_pages_list = lp;
 		nr_pages--;
 	}
 	/* Preallocate memory for the image */
-	safe_pages_list = NULL;
 	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
 	while (nr_pages > 0) {
 		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
@@ -2396,12 +2410,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 		swsusp_set_page_free(virt_to_page(lp));
 		nr_pages--;
 	}
-	/* Free the reserved safe pages so that chain_alloc() can use them */
-	while (sp_list) {
-		lp = sp_list->next;
-		free_image_page(sp_list, PG_UNSAFE_CLEAR);
-		sp_list = lp;
-	}
 	return 0;
 
  Free:
@@ -2491,6 +2499,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		if (error)
 			return error;
 
+		safe_pages_list = NULL;
+
 		error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
 		if (error)
 			return error;
-- 
cgit v1.2.3


From 6dbecfd345a617888da370b13d5b190c9ff3df53 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:02:16 +0200
Subject: PM / hibernate: Simplify mark_unsafe_pages()

Rework mark_unsafe_pages() to use a simpler method of clearing
all bits in free_pages_map and to set the bits for the "unsafe"
pages (ie. pages that were used by the image kernel before
hibernation) with the help of duplicate_memory_bitmap().

For this purpose, move the pfn_valid() check from mark_unsafe_pages()
to unpack_orig_pfns() where the "unsafe" pages are discovered.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 64 +++++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d9476ff877b8..39bbad5fac5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2019,53 +2019,41 @@ int snapshot_read_next(struct snapshot_handle *handle)
 	return PAGE_SIZE;
 }
 
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+				    struct memory_bitmap *src)
+{
+	unsigned long pfn;
+
+	memory_bm_position_reset(src);
+	pfn = memory_bm_next_pfn(src);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_set_bit(dst, pfn);
+		pfn = memory_bm_next_pfn(src);
+	}
+}
+
 /**
  *	mark_unsafe_pages - mark the pages that cannot be used for storing
  *	the image during resume, because they conflict with the pages that
  *	had been used before suspend
  */
 
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void mark_unsafe_pages(struct memory_bitmap *bm)
 {
-	struct zone *zone;
-	unsigned long pfn, max_zone_pfn;
+	unsigned long pfn;
 
-	/* Clear page flags */
-	for_each_populated_zone(zone) {
-		max_zone_pfn = zone_end_pfn(zone);
-		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
-			if (pfn_valid(pfn))
-				swsusp_unset_page_free(pfn_to_page(pfn));
+	/* Clear the "free"/"unsafe" bit for all PFNs */
+	memory_bm_position_reset(free_pages_map);
+	pfn = memory_bm_next_pfn(free_pages_map);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_clear_current(free_pages_map);
+		pfn = memory_bm_next_pfn(free_pages_map);
 	}
 
-	/* Mark pages that correspond to the "original" pfns as "unsafe" */
-	memory_bm_position_reset(bm);
-	do {
-		pfn = memory_bm_next_pfn(bm);
-		if (likely(pfn != BM_END_OF_MAP)) {
-			if (likely(pfn_valid(pfn)))
-				swsusp_set_page_free(pfn_to_page(pfn));
-			else
-				return -EFAULT;
-		}
-	} while (pfn != BM_END_OF_MAP);
+	/* Mark pages that correspond to the "original" PFNs as "unsafe" */
+	duplicate_memory_bitmap(free_pages_map, bm);
 
 	allocated_unsafe_pages = 0;
-
-	return 0;
-}
-
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
-{
-	unsigned long pfn;
-
-	memory_bm_position_reset(src);
-	pfn = memory_bm_next_pfn(src);
-	while (pfn != BM_END_OF_MAP) {
-		memory_bm_set_bit(dst, pfn);
-		pfn = memory_bm_next_pfn(src);
-	}
 }
 
 static int check_header(struct swsusp_info *info)
@@ -2115,7 +2103,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 		/* Extract and buffer page key for data page (s390 only). */
 		page_key_memorize(buf + j);
 
-		if (memory_bm_pfn_present(bm, buf[j]))
+		if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
 			memory_bm_set_bit(bm, buf[j]);
 		else
 			return -EFAULT;
@@ -2357,9 +2345,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 	buffer = NULL;
 
 	nr_highmem = count_highmem_image_pages(bm);
-	error = mark_unsafe_pages(bm);
-	if (error)
-		goto Free;
+	mark_unsafe_pages(bm);
 
 	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
 	if (error)
-- 
cgit v1.2.3


From 307c5971c972ef2bfd541d2850b36a692c6354c9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Jun 2016 03:05:10 +0200
Subject: PM / hibernate: Recycle safe pages after image restoration

One of the memory bitmaps used by the hibernation image restoration
code is freed after the image has been loaded.

That is not quite efficient, though, because the memory pages used
for building that bitmap are known to be safe (ie. they were not
used by the image kernel before hibernation) and the arch-specific
code finalizing the image restoration may need them.  In that case
it needs to allocate those pages again via the memory management
subsystem, check if they are really safe again by consulting the
other bitmaps and so on.

To avoid that, recycle those pages by putting them into the global
list of known safe pages so that they can be given to the arch code
right away when necessary.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 39bbad5fac5a..94b6fe6c9ae3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -158,6 +158,14 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
 	return page;
 }
 
+static void recycle_safe_page(void *page_address)
+{
+	struct linked_page *lp = page_address;
+
+	lp->next = safe_pages_list;
+	safe_pages_list = lp;
+}
+
 /**
  *	free_image_page - free page represented by @addr, allocated with
  *	get_image_page (page flags set by it must be cleared)
@@ -852,6 +860,34 @@ struct nosave_region {
 
 static LIST_HEAD(nosave_regions);
 
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+	struct rtree_node *node;
+
+	list_for_each_entry(node, &zone->nodes, list)
+		recycle_safe_page(node->data);
+
+	list_for_each_entry(node, &zone->leaves, list)
+		recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+	struct mem_zone_bm_rtree *zone;
+	struct linked_page *p_list;
+
+	list_for_each_entry(zone, &bm->zones, list)
+		recycle_zone_bm_rtree(zone);
+
+	p_list = bm->p_list;
+	while (p_list) {
+		struct linked_page *lp = p_list;
+
+		p_list = lp->next;
+		recycle_safe_page(lp);
+	}
+}
+
 /**
  *	register_nosave_region - register a range of page frames the contents
  *	of which should not be saved during the suspend (to be used in the early
@@ -2542,9 +2578,9 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
 	/* Restore page key for data page (s390 only). */
 	page_key_write(handle->buffer);
 	page_key_free();
-	/* Free only if we have loaded the image entirely */
+	/* Do that only if we have loaded the image entirely */
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
-		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+		memory_bm_recycle(&orig_bm);
 		free_highmem_data();
 	}
 }
-- 
cgit v1.2.3


From 501c2375253c0795048f48368e0b3e8b2f6646dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 Jul 2016 10:04:34 -0400
Subject: ftrace: Move toplevel init out of ftrace_init_tracefs()

Commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events
do") placed ftrace_init_tracefs into the instance creation, and encapsulated
the top level updating with an if conditional, as the top level only gets
updated at boot up. Unfortunately, this triggers section mismatch errors as
the init functions are called from a function that can be called later, and
the section mismatch logic is unaware of the if conditional that would
prevent it from happening at run time.

To make everyone happy, create a separate ftrace_init_tracefs_toplevel()
routine that only gets called by init functions, and this will be what calls
other init functions for the toplevel directory.

Link: http://lkml.kernel.org/r/20160704102139.19cbc0d9@gandalf.local.home

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the bitmap like events do")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 16 ++++++++++------
 kernel/trace/trace.c  |  1 +
 kernel/trace/trace.h  |  3 +++
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8b488f4dd8e8..84752c8e28b5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5539,16 +5539,20 @@ static const struct file_operations ftrace_pid_fops = {
 
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	/* Only the top level directory has the dyn_tracefs and profile */
-	if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
-		ftrace_init_dyn_tracefs(d_tracer);
-		ftrace_profile_tracefs(d_tracer);
-	}
-
 	trace_create_file("set_ftrace_pid", 0644, d_tracer,
 			    tr, &ftrace_pid_fops);
 }
 
+void __init ftrace_init_tracefs_toplevel(struct trace_array *tr,
+					 struct dentry *d_tracer)
+{
+	/* Only the top level directory has the dyn_tracefs and profile */
+	WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+
+	ftrace_init_dyn_tracefs(d_tracer);
+	ftrace_profile_tracefs(d_tracer);
+}
+
 /**
  * ftrace_kill - kill ftrace
  *
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3d9f31b576f3..5fd53a7847bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7369,6 +7369,7 @@ static __init int tracer_init_tracefs(void)
 		return 0;
 
 	init_tracer_tracefs(&global_trace, d_tracer);
+	ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
 
 	trace_create_file("tracing_thresh", 0644, d_tracer,
 			&global_trace, &tracing_thresh_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index eaee458755a4..c1de3f493cd3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -857,6 +857,8 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
 void ftrace_reset_array_ops(struct trace_array *tr);
 int using_ftrace_ops_list_func(void);
 void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+void ftrace_init_tracefs_toplevel(struct trace_array *tr,
+				  struct dentry *d_tracer);
 #else
 static inline int ftrace_trace_task(struct trace_array *tr)
 {
@@ -874,6 +876,7 @@ static inline __init void
 ftrace_init_global_array_ops(struct trace_array *tr) { }
 static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
 static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
 /* ftace_func_t type is not defined, use macro instead of static inline */
 #define ftrace_init_array_ops(tr, func) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER */
-- 
cgit v1.2.3


From 67f20b084574def586ecba68508acd5d054ccc88 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 4 Jul 2016 15:10:04 +0000
Subject: tracing: Using for_each_set_bit() to simplify trace_pid_write()

Using for_each_set_bit() to simplify the code.

Link: http://lkml.kernel.org/r/1467645004-11169-1-git-send-email-weiyj_lk@163.com

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5fd53a7847bc..dade4c9559cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -517,13 +517,9 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
 
 	if (filtered_pids) {
 		/* copy the current bits to the new max */
-		pid = find_first_bit(filtered_pids->pids,
-				     filtered_pids->pid_max);
-		while (pid < filtered_pids->pid_max) {
+		for_each_set_bit(pid, filtered_pids->pids,
+				 filtered_pids->pid_max) {
 			set_bit(pid, pid_list->pids);
-			pid = find_next_bit(filtered_pids->pids,
-					    filtered_pids->pid_max,
-					    pid + 1);
 			nr_pids++;
 		}
 	}
-- 
cgit v1.2.3


From 7ad8fb61c4abf589596f0a4da34d987471481569 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Sun, 3 Jul 2016 08:51:34 -0500
Subject: tracing: Have HIST_TRIGGERS select TRACING

The kbuild test robot reported a compile error if HIST_TRIGGERS was
enabled but nothing else that selected TRACING was configured in.

HIST_TRIGGERS should directly select it and not rely on anything else
to do it.

Link: http://lkml.kernel.org/r/57791866.8080505@linux.intel.com

Reported-by: kbuild test robot <fennguang.wu@intel.com>
Fixes: 7ef224d1d0e3a ("tracing: Add 'hist' event trigger command")
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fafeaf803bd0..f4b86e8ca1e7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -542,6 +542,7 @@ config HIST_TRIGGERS
 	bool "Histogram triggers"
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select TRACING_MAP
+	select TRACING
 	default n
 	help
 	  Hist triggers allow one or more arbitrary trace event fields
-- 
cgit v1.2.3


From a4a551b8f1d4c4ebffd0f49dfef44df3128546f8 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 29 Jun 2016 19:56:48 +0900
Subject: ftrace: Reduce size of function graph entries

Currently ftrace_graph_ent{,_entry} and ftrace_graph_ret{,_entry} struct
can have padding bytes at the end due to alignment in 64-bit data type.
As these data are recorded so frequently, those paddings waste
non-negligible space.  As the ring buffer maintains alignment properly
for each architecture, just to remove the extra padding using 'packed'
attribute.

  ftrace_graph_ent_entry:  24 -> 20
  ftrace_graph_ret_entry:  48 -> 44

Also I moved the 'overrun' field in struct ftrace_graph_ret to minimize
the padding in the middle.

Tested on x86_64 only.

Link: http://lkml.kernel.org/r/1467197808-13578-1-git-send-email-namhyung@kernel.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: linux-arch@vger.kernel.org
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h         | 11 +++++++++++
 kernel/trace/trace_entries.h |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c1de3f493cd3..f783df416726 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -80,6 +80,12 @@ enum trace_type {
 	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
 		     filter)
 
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print,	\
+			    filter)					\
+	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+		     filter) __packed
+
 #include "trace_entries.h"
 
 /*
@@ -1625,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\
 	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
 		     filter)
+#undef FTRACE_ENTRY_PACKED
+#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \
+	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
+		     filter)
+
 #include "trace_entries.h"
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..5c30efcda5e6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
 );
 
 /* Function call entry */
-FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
 
 	TRACE_GRAPH_ENT,
 
@@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 );
 
 /* Function return entry */
-FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
 
 	TRACE_GRAPH_RET,
 
-- 
cgit v1.2.3


From ecb23dc6f2eff0ce64dd60351a81f376f13b12cc Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 20 May 2016 09:26:48 +0200
Subject: xen: add steal_clock support on x86

The pv_time_ops structure contains a function pointer for the
"steal_clock" functionality used only by KVM and Xen on ARM. Xen on x86
uses its own mechanism to account for the "stolen" time a thread wasn't
able to run due to hypervisor scheduling.

Add support in Xen arch independent time handling for this feature by
moving it out of the arm arch into drivers/xen and remove the x86 Xen
hack.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 kernel/sched/cputime.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c5498d5..8c4c6dcc052c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -489,16 +489,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		account_idle_time(cputime_one_jiffy);
 }
 
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-	account_steal_time(jiffies_to_cputime(ticks));
-}
-
 /*
  * Account multiple ticks of idle time.
  * @ticks: number of stolen ticks
-- 
cgit v1.2.3


From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Jul 2016 22:38:36 -0700
Subject: bpf: introduce bpf_get_current_task() helper

over time there were multiple requests to access different data
structures and fields of task_struct current, so finally add
the helper to access 'current' as-is. Tracing bpf programs will do
the rest of walking the pointers via bpf_probe_read().
Note that current can be null and bpf program has to deal it with,
but even dumb passing null into bpf_probe_read() is still safe.

Suggested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 19c5b4a5c3eb..094c716154ed 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void)
 	return &bpf_event_output_proto;
 }
 
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return (long) current;
+}
+
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+	.func		= bpf_get_current_task,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_get_current_pid_tgid:
 		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_task:
+		return &bpf_get_current_task_proto;
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_current_comm:
-- 
cgit v1.2.3


From 2f88e41a22ccfa95291c4df573f8ed4c6a71f29b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 02:40:56 +0200
Subject: PM / hibernate: Add missing braces in hibernate_setup()

Make hibernate_setup() follow the coding style more closely by adding
some missing braces to the if () statement in it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 126e24caa82e..b00f270d328e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1119,11 +1119,11 @@ static int __init resume_offset_setup(char *str)
 
 static int __init hibernate_setup(char *str)
 {
-	if (!strncmp(str, "noresume", 8))
+	if (!strncmp(str, "noresume", 8)) {
 		noresume = 1;
-	else if (!strncmp(str, "nocompress", 10))
+	} else if (!strncmp(str, "nocompress", 10)) {
 		nocompress = 1;
-	else if (!strncmp(str, "no", 2)) {
+	} else if (!strncmp(str, "no", 2)) {
 		noresume = 1;
 		nohibernate = 1;
 	}
-- 
cgit v1.2.3


From efd5a85242e996275ebf3df71013beabd723bda3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:42:46 +0200
Subject: PM / hibernate: Clean up function headers in snapshot.c

The formatting of some function headers in kernel/power/snapshot.c
is not consistent with the general kernel coding style and with the
formatting of some other function headers in the same file.

Make all of them follow the same formatting convention.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 93 ++++++++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 94b6fe6c9ae3..1fe0ddb6fd0d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -186,8 +186,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	__free_page(page);
 }
 
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+				      int clear_page_nosave)
 {
 	while (list) {
 		struct linked_page *lp = list->next;
@@ -219,8 +219,8 @@ struct chain_allocator {
 	int safe_needed;	/* if set, only "safe" pages are allocated */
 };
 
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+		       int safe_needed)
 {
 	ca->chain = NULL;
 	ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -452,10 +452,11 @@ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
  *	This function also allocated and builds the radix tree for the
  *	zone.
  */
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
-		     struct chain_allocator *ca,
-		     unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+						      int safe_needed,
+						      struct chain_allocator *ca,
+						      unsigned long start,
+						      unsigned long end)
 {
 	struct mem_zone_bm_rtree *zone;
 	unsigned int i, nr_blocks;
@@ -595,8 +596,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 /**
   *	memory_bm_create - allocate memory for a memory bitmap
   */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+			    int safe_needed)
 {
 	struct chain_allocator ca;
 	struct list_head mem_extents;
@@ -894,9 +895,8 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
  *	initialization code)
  */
 
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
-			 int use_kmalloc)
+void __init __register_nosave_region(unsigned long start_pfn,
+				     unsigned long end_pfn, int use_kmalloc)
 {
 	struct nosave_region *region;
 
@@ -1277,8 +1277,7 @@ static void safe_copy_page(void *dst, struct page *s_page)
 
 
 #ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
 {
 	return is_highmem(zone) ?
 		saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
@@ -1321,8 +1320,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 }
 #endif /* CONFIG_HIGHMEM */
 
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+static void copy_data_pages(struct memory_bitmap *copy_bm,
+			    struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
 	unsigned long pfn;
@@ -1485,8 +1484,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
 }
 
 static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
-						unsigned long highmem,
-						unsigned long total)
+						  unsigned long highmem,
+						  unsigned long total)
 {
 	unsigned long alloc = __fraction(nr_pages, highmem, total);
 
@@ -1499,8 +1498,8 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
 }
 
 static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
-						unsigned long highmem,
-						unsigned long total)
+							 unsigned long highmem,
+							 unsigned long total)
 {
 	return 0;
 }
@@ -1780,8 +1779,7 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 	return nr_highmem;
 }
 #else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1823,8 +1821,8 @@ static inline int get_highmem_buffer(int safe_needed)
  *	highmem pages is lesser than that, allocate them all.
  */
 
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+					       unsigned int nr_highmem)
 {
 	unsigned int to_alloc = count_free_highmem_pages();
 
@@ -1843,8 +1841,8 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 #else
 static inline int get_highmem_buffer(int safe_needed) { return 0; }
 
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+					       unsigned int n) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1859,9 +1857,9 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
  *	copy_data_pages() works.
  */
 
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
-		unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *orig_bm,
+			struct memory_bitmap *copy_bm,
+			unsigned int nr_pages, unsigned int nr_highmem)
 {
 	if (nr_highmem > 0) {
 		if (get_highmem_buffer(PG_ANY))
@@ -1978,8 +1976,7 @@ static int init_header(struct swsusp_info *info)
  *	are stored in the array @buf[] (1 page at a time)
  */
 
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
@@ -2110,8 +2107,7 @@ static int check_header(struct swsusp_info *info)
  *	load header - check the image header and copy data from it
  */
 
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
 {
 	int error;
 
@@ -2204,8 +2200,8 @@ static unsigned int safe_highmem_pages;
 
 static struct memory_bitmap *safe_highmem_bm;
 
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+static int prepare_highmem_image(struct memory_bitmap *bm,
+				 unsigned int *nr_highmem_p)
 {
 	unsigned int to_alloc;
 
@@ -2259,8 +2255,8 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
 
 static struct page *last_highmem_page;
 
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+				     struct chain_allocator *ca)
 {
 	struct highmem_pbe *pbe;
 	void *kaddr;
@@ -2333,17 +2329,13 @@ static inline void free_highmem_data(void)
 		free_image_page(buffer, PG_UNSAFE_CLEAR);
 }
 #else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
 
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
-	return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+					unsigned int *nr_highmem_p) { return 0; }
 
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+					    struct chain_allocator *ca)
 {
 	return ERR_PTR(-EINVAL);
 }
@@ -2369,8 +2361,7 @@ static inline void free_highmem_data(void) {}
 
 #define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
 
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
 	struct linked_page *lp;
@@ -2593,8 +2584,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle)
 
 #ifdef CONFIG_HIGHMEM
 /* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+				       void *buf)
 {
 	void *kaddr1, *kaddr2;
 
-- 
cgit v1.2.3


From ef96f639ea663474c4e1c57bd64e118ffbb92be4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:43:46 +0200
Subject: PM / hibernate: Clean up comments in snapshot.c

Many comments in kernel/power/snapshot.c do not follow the general
comment formatting rules.  They look odd, some of them are outdated
too, some are hard to parse and generally difficult to understand.

Clean them up to make them easier to comprehend.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 636 +++++++++++++++++++++++++-----------------------
 1 file changed, 330 insertions(+), 306 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1fe0ddb6fd0d..bd927d9efeb7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -67,7 +67,8 @@ void __init hibernate_image_size_init(void)
 	image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
 }
 
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
  * the suspend and included in the suspend image, but have also been
  * allocated by the "resume" kernel, so their contents cannot be written
  * directly to their "original" page frames.
@@ -93,16 +94,6 @@ static struct linked_page *safe_pages_list;
 /* Pointer to an auxiliary buffer (1 page) */
 static void *buffer;
 
-/**
- *	@safe_needed - on resume, for storing the PBE list and the image,
- *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.  The unsafe pages have PageNosaveFree set
- *	and we count them using unsafe_pages.
- *
- *	Each allocated image page is marked as PageNosave and PageNosaveFree
- *	so that swsusp_free() can release it.
- */
-
 #define PG_ANY		0
 #define PG_SAFE		1
 #define PG_UNSAFE_CLEAR	1
@@ -110,6 +101,19 @@ static void *buffer;
 
 static unsigned int allocated_unsafe_pages;
 
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation.  The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
 static void *get_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
@@ -167,10 +171,13 @@ static void recycle_safe_page(void *page_address)
 }
 
 /**
- *	free_image_page - free page represented by @addr, allocated with
- *	get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
  */
-
 static inline void free_image_page(void *addr, int clear_nosave_free)
 {
 	struct page *page;
@@ -197,24 +204,22 @@ static inline void free_list_of_pages(struct linked_page *list,
 	}
 }
 
-/**
-  *	struct chain_allocator is used for allocating small objects out of
-  *	a linked list of pages called 'the chain'.
-  *
-  *	The chain grows each time when there is no room for a new object in
-  *	the current page.  The allocated objects cannot be freed individually.
-  *	It is only possible to free them all at once, by freeing the entire
-  *	chain.
-  *
-  *	NOTE: The chain allocator may be inefficient if the allocated objects
-  *	are not much smaller than PAGE_SIZE.
-  */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page.  The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
 struct chain_allocator {
 	struct linked_page *chain;	/* the chain */
 	unsigned int used_space;	/* total size of objects allocated out
-					 * of the current page
-					 */
+					   of the current page */
 	gfp_t gfp_mask;		/* mask for allocating pages */
 	int safe_needed;	/* if set, only "safe" pages are allocated */
 };
@@ -250,44 +255,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 }
 
 /**
- *	Data types related to memory bitmaps.
+ * Data types related to memory bitmaps.
  *
- *	Memory bitmap is a structure consiting of many linked lists of
- *	objects.  The main list's elements are of type struct zone_bitmap
- *	and each of them corresonds to one zone.  For each zone bitmap
- *	object there is a list of objects of type struct bm_block that
- *	represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consiting of many linked lists of
+ * objects.  The main list's elements are of type struct zone_bitmap
+ * and each of them corresonds to one zone.  For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bitmap in which information is stored.
  *
- *	struct memory_bitmap contains a pointer to the main list of zone
- *	bitmap objects, a struct bm_position used for browsing the bitmap,
- *	and a pointer to the list of pages used for allocating all of the
- *	zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
  *
- *	NOTE: It has to be possible to lay out the bitmap in memory
- *	using only allocations of order 0.  Additionally, the bitmap is
- *	designed to work with arbitrary number of zones (this is over the
- *	top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0.  Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
  *
- *	struct zone_bitmap contains a pointer to a list of bitmap block
- *	objects and a pointer to the bitmap block object that has been
- *	most recently used for setting bits.  Additionally, it contains the
- *	pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits.  Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
  *
- *	struct bm_block contains a pointer to the memory page in which
- *	information is stored (in the form of a block of bitmap)
- *	It also contains the pfns that correspond to the start and end of
- *	the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
  *
- *	The memory bitmap is organized as a radix tree to guarantee fast random
- *	access to the bits. There is one radix tree for each zone (as returned
- *	from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
  *
- *	One radix tree is represented by one struct mem_zone_bm_rtree. There are
- *	two linked lists for the nodes of the tree, one for the inner nodes and
- *	one for the leave nodes. The linked leave nodes are used for fast linear
- *	access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
  *
- *	The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
  */
 
 #define BM_END_OF_MAP	(~0UL)
@@ -333,9 +338,8 @@ struct bm_position {
 struct memory_bitmap {
 	struct list_head zones;
 	struct linked_page *p_list;	/* list of pages used to store zone
-					 * bitmap objects and bitmap block
-					 * objects
-					 */
+					   bitmap objects and bitmap block
+					   objects */
 	struct bm_position cur;	/* most recently used bit position */
 };
 
@@ -349,12 +353,12 @@ struct memory_bitmap {
 #endif
 #define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
 
-/*
- *	alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
  *
- *	This function is used to allocate inner nodes as well as the
- *	leave nodes of the radix tree. It also adds the node to the
- *	corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
  */
 static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
 					   struct chain_allocator *ca,
@@ -375,12 +379,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
 	return node;
 }
 
-/*
- *	add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leave node to the radix tree.
  *
- *	The leave nodes need to be allocated in order to keep the leaves
- *	linked list in order. This is guaranteed by the zone->blocks
- *	counter.
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
  */
 static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
 			   int safe_needed, struct chain_allocator *ca)
@@ -445,12 +449,12 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 			       int clear_nosave_free);
 
-/*
- *	create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
  *
- *	Allocated the mem_zone_bm_rtree structure and initializes it.
- *	This function also allocated and builds the radix tree for the
- *	zone.
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
  */
 static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
 						      int safe_needed,
@@ -483,12 +487,12 @@ static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
 	return zone;
 }
 
-/*
- *	free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
  *
- *	Free all node pages of the radix tree. The mem_zone_bm_rtree
- *	structure itself is not freed here nor are the rtree_node
- *	structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
  */
 static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
 			       int clear_nosave_free)
@@ -521,8 +525,8 @@ struct mem_extent {
 };
 
 /**
- *	free_mem_extents - free a list of memory extents
- *	@list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
  */
 static void free_mem_extents(struct list_head *list)
 {
@@ -535,10 +539,11 @@ static void free_mem_extents(struct list_head *list)
 }
 
 /**
- *	create_mem_extents - create a list of memory extents representing
- *	                     contiguous ranges of PFNs
- *	@list - list to put the extents into
- *	@gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
  */
 static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 {
@@ -594,8 +599,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 }
 
 /**
-  *	memory_bm_create - allocate memory for a memory bitmap
-  */
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
 static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
 			    int safe_needed)
 {
@@ -636,8 +641,9 @@ static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
 }
 
 /**
-  *	memory_bm_free - free memory occupied by the memory bitmap @bm
-  */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 {
 	struct mem_zone_bm_rtree *zone;
@@ -651,14 +657,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
 }
 
 /**
- *	memory_bm_find_bit - Find the bit for pfn in the memory
- *			     bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
  *
- *	Find the bit in the bitmap @bm that corresponds to given pfn.
- *	The cur.zone, cur.block and cur.node_pfn member of @bm are
- *	updated.
- *	It walks the radix tree to find the page which contains the bit for
- *	pfn and returns the bit position in **addr and *bit_nr.
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
+ *
+ * Walk the radix tree to find the page containing the bit that represents @pfn
+ * and return the position of the bit in @addr and @bit_nr.
  */
 static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 			      void **addr, unsigned int *bit_nr)
@@ -687,10 +692,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 
 zone_found:
 	/*
-	 * We have a zone. Now walk the radix tree to find the leave
-	 * node for our pfn.
+	 * We have found the zone. Now walk the radix tree to find the leaf node
+	 * for our PFN.
 	 */
-
 	node = bm->cur.node;
 	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
 		goto node_found;
@@ -783,14 +787,14 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
 }
 
 /*
- *	rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
  *
- *	Sets the position to the beginning of the next node in the
- *	memory bitmap. This is either the next node in the current
- *	zone's radix tree or the first node in the radix tree of the
- *	next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
  *
- *	Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
@@ -819,14 +823,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 }
 
 /**
- *	memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
  *
- *	Starting from the last returned position this function searches
- *	for the next set bit in the memory bitmap and returns its
- *	number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it.  If no more bits are
+ * set, BM_END_OF_MAP is returned.
  *
- *	It is required to run memory_bm_position_reset() before the
- *	first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
  */
 static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
@@ -848,11 +853,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 	return BM_END_OF_MAP;
 }
 
-/**
- *	This structure represents a range of page frames the contents of which
- *	should not be saved during the suspend.
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
  */
-
 struct nosave_region {
 	struct list_head list;
 	unsigned long start_pfn;
@@ -890,11 +894,11 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
 }
 
 /**
- *	register_nosave_region - register a range of page frames the contents
- *	of which should not be saved during the suspend (to be used in the early
- *	initialization code)
+ * register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
  */
-
 void __init __register_nosave_region(unsigned long start_pfn,
 				     unsigned long end_pfn, int use_kmalloc)
 {
@@ -913,7 +917,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
 		}
 	}
 	if (use_kmalloc) {
-		/* during init, this shouldn't fail */
+		/* During init, this shouldn't fail */
 		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
 		BUG_ON(!region);
 	} else
@@ -979,10 +983,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
 }
 
 /**
- *	mark_nosave_pages - set bits corresponding to the page frames the
- *	contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
  */
-
 static void mark_nosave_pages(struct memory_bitmap *bm)
 {
 	struct nosave_region *region;
@@ -1012,13 +1018,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 }
 
 /**
- *	create_basic_memory_bitmaps - create bitmaps needed for marking page
- *	frames that should not be saved and free page frames.  The pointers
- *	forbidden_pages_map and free_pages_map are only modified if everything
- *	goes well, because we don't want the bits to be used before both bitmaps
- *	are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames.  The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
  */
-
 int create_basic_memory_bitmaps(void)
 {
 	struct memory_bitmap *bm1, *bm2;
@@ -1063,12 +1069,12 @@ int create_basic_memory_bitmaps(void)
 }
 
 /**
- *	free_basic_memory_bitmaps - free memory bitmaps allocated by
- *	create_basic_memory_bitmaps().  The auxiliary pointers are necessary
- *	so that the bitmaps themselves are not referred to while they are being
- *	freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps().  The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
  */
-
 void free_basic_memory_bitmaps(void)
 {
 	struct memory_bitmap *bm1, *bm2;
@@ -1089,11 +1095,13 @@ void free_basic_memory_bitmaps(void)
 }
 
 /**
- *	snapshot_additional_pages - estimate the number of additional pages
- *	be needed for setting up the suspend image data structures for given
- *	zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up a hibernation
+ * image data structures for @zone (usually, the returned value is greater than
+ * the exact number).
  */
-
 unsigned int snapshot_additional_pages(struct zone *zone)
 {
 	unsigned int rtree, nodes;
@@ -1111,10 +1119,10 @@ unsigned int snapshot_additional_pages(struct zone *zone)
 
 #ifdef CONFIG_HIGHMEM
 /**
- *	count_free_highmem_pages - compute the total number of free highmem
- *	pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
  */
-
 static unsigned int count_free_highmem_pages(void)
 {
 	struct zone *zone;
@@ -1128,11 +1136,12 @@ static unsigned int count_free_highmem_pages(void)
 }
 
 /**
- *	saveable_highmem_page - Determine whether a highmem page should be
- *	included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
+ *
+ * Determine whether a highmem page should be included in a hibernation image.
  *
- *	We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- *	and it isn't a part of a free chunk of pages.
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
  */
 static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 {
@@ -1158,10 +1167,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
 }
 
 /**
- *	count_highmem_pages - compute the total number of saveable highmem
- *	pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
  */
-
 static unsigned int count_highmem_pages(void)
 {
 	struct zone *zone;
@@ -1189,12 +1196,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	saveable_page - Determine whether a non-highmem page should be included
- *	in the suspend image.
+ * saveable_page - Check if the given page is saveable.
+ *
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
  *
- *	We should save the page if it isn't Nosave, and is not in the range
- *	of pages statically defined as 'unsaveable', and it isn't a part of
- *	a free chunk of pages.
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
  */
 static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 {
@@ -1223,10 +1232,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
 }
 
 /**
- *	count_data_pages - compute the total number of saveable non-highmem
- *	pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
  */
-
 static unsigned int count_data_pages(void)
 {
 	struct zone *zone;
@@ -1246,7 +1253,8 @@ static unsigned int count_data_pages(void)
 	return n;
 }
 
-/* This is needed, because copy_page and memcpy are not usable for copying
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
  * task structs.
  */
 static inline void do_copy_page(long *dst, long *src)
@@ -1257,12 +1265,12 @@ static inline void do_copy_page(long *dst, long *src)
 		*dst++ = *src++;
 }
 
-
 /**
- *	safe_copy_page - check if the page we are going to copy is marked as
- *		present in the kernel page tables (this always is the case if
- *		CONFIG_DEBUG_PAGEALLOC is not set and in that case
- *		kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
+ * and in that case kernel_page_present() always returns 'true').
  */
 static void safe_copy_page(void *dst, struct page *s_page)
 {
@@ -1275,7 +1283,6 @@ static void safe_copy_page(void *dst, struct page *s_page)
 	}
 }
 
-
 #ifdef CONFIG_HIGHMEM
 static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
 {
@@ -1298,7 +1305,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 		kunmap_atomic(src);
 	} else {
 		if (PageHighMem(d_page)) {
-			/* Page pointed to by src may contain some kernel
+			/*
+			 * The page pointed to by src may contain some kernel
 			 * data modified by kmap_atomic()
 			 */
 			safe_copy_page(buffer, s_page);
@@ -1370,12 +1378,11 @@ static struct memory_bitmap orig_bm;
 static struct memory_bitmap copy_bm;
 
 /**
- *	swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
  *
- *	Suspend pages are alocated before the atomic copy is made, so we
- *	need to release them after the resume.
+ * Image pages are alocated before snapshot creation, so they need to be
+ * released after resume.
  */
-
 void swsusp_free(void)
 {
 	unsigned long fb_pfn, fr_pfn;
@@ -1424,7 +1431,7 @@ out:
 #define GFP_IMAGE	(GFP_KERNEL | __GFP_NOWARN)
 
 /**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
  * @nr_pages: Number of page frames to allocate.
  * @mask: GFP flags to use for the allocation.
  *
@@ -1474,7 +1481,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
 }
 
 /**
- *  __fraction - Compute (an approximation of) x * (multiplier / base)
+ *  __fraction - Compute (an approximation of) x * (multiplier / base).
  */
 static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
 {
@@ -1506,7 +1513,7 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
  */
 static unsigned long free_unnecessary_pages(void)
 {
@@ -1560,7 +1567,7 @@ static unsigned long free_unnecessary_pages(void)
 }
 
 /**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
  * @saveable: Number of saveable pages in the system.
  *
  * We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1590,7 +1597,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
 }
 
 /**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
  *
  * To create a hibernation image it is necessary to make a copy of every page
  * frame in use.  We also need a number of page frames to be free during
@@ -1763,10 +1770,11 @@ int hibernate_preallocate_memory(void)
 
 #ifdef CONFIG_HIGHMEM
 /**
-  *	count_pages_for_highmem - compute the number of non-highmem pages
-  *	that will be necessary for creating copies of highmem pages.
-  */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
 static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 {
 	unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1783,10 +1791,8 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0;
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	enough_free_mem - Make sure we have enough free memory for the
- *	snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
  */
-
 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
@@ -1805,10 +1811,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 
 #ifdef CONFIG_HIGHMEM
 /**
- *	get_highmem_buffer - if there are some highmem pages in the suspend
- *	image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
  */
-
 static inline int get_highmem_buffer(int safe_needed)
 {
 	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
@@ -1816,11 +1823,11 @@ static inline int get_highmem_buffer(int safe_needed)
 }
 
 /**
- *	alloc_highmem_image_pages - allocate some highmem pages for the image.
- *	Try to allocate as many pages as needed, but if the number of free
- *	highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_image_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
  */
-
 static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
 					       unsigned int nr_highmem)
 {
@@ -1846,17 +1853,16 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- *	swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
  *
- *	We first try to allocate as many highmem pages as there are
- *	saveable highmem pages in the system.  If that fails, we allocate
- *	non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system.  If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
  *
- *	In this approach it is likely that the copies of highmem pages will
- *	also be located in the high memory, because of the way in which
- *	copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
  */
-
 static int swsusp_alloc(struct memory_bitmap *orig_bm,
 			struct memory_bitmap *copy_bm,
 			unsigned int nr_pages, unsigned int nr_highmem)
@@ -1909,7 +1915,8 @@ asmlinkage __visible int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	/* During allocating of suspend pagedir, new cold pages may appear.
+	/*
+	 * During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages(NULL);
@@ -1972,10 +1979,13 @@ static int init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- *	are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time).
  */
-
 static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
@@ -1990,22 +2000,21 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 }
 
 /**
- *	snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
  *
- *	On the first call to it @handle should point to a zeroed
- *	snapshot_handle structure.  The structure gets updated and a pointer
- *	to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure.  The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
  *
- *	On success the function returns a positive number.  Then, the caller
- *	is allowed to read up to the returned number of bytes from the memory
- *	location computed by the data_of() macro.
+ * On success, the function returns a positive number.  Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
  *
- *	The function returns 0 to indicate the end of data stream condition,
- *	and a negative number is returned on error.  In such cases the
- *	structure pointed to by @handle is not updated and should not be used
- *	any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors.  If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
  */
-
 int snapshot_read_next(struct snapshot_handle *handle)
 {
 	if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -2034,7 +2043,8 @@ int snapshot_read_next(struct snapshot_handle *handle)
 
 		page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
 		if (PageHighMem(page)) {
-			/* Highmem pages are copied to the buffer,
+			/*
+			 * Highmem pages are copied to the buffer,
 			 * because we can't return with a kmapped
 			 * highmem page (we may not be called again).
 			 */
@@ -2066,11 +2076,11 @@ static void duplicate_memory_bitmap(struct memory_bitmap *dst,
 }
 
 /**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
  */
-
 static void mark_unsafe_pages(struct memory_bitmap *bm)
 {
 	unsigned long pfn;
@@ -2104,9 +2114,8 @@ static int check_header(struct swsusp_info *info)
 }
 
 /**
- *	load header - check the image header and copy data from it
+ * load header - Check the image header and copy the data from it.
  */
-
 static int load_header(struct swsusp_info *info)
 {
 	int error;
@@ -2121,8 +2130,12 @@ static int load_header(struct swsusp_info *info)
 }
 
 /**
- *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- *	the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm.
  */
 static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
@@ -2145,7 +2158,8 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 }
 
 #ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
  * should be restored atomically during the resume from disk, because the page
  * frames they have occupied before the suspend are in use.
  */
@@ -2155,7 +2169,8 @@ struct highmem_pbe {
 	struct highmem_pbe *next;
 };
 
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
  * allocated before the suspend and included in the suspend image, but have
  * also been allocated by the "resume" kernel, so their contents cannot be
  * written directly to their "original" page frames.
@@ -2163,11 +2178,11 @@ struct highmem_pbe {
 static struct highmem_pbe *highmem_pblist;
 
 /**
- *	count_highmem_image_pages - compute the number of highmem pages in the
- *	suspend image.  The bits in the memory bitmap @bm that correspond to the
- *	image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
  */
-
 static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 {
 	unsigned long pfn;
@@ -2184,22 +2199,23 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
 	return cnt;
 }
 
-/**
- *	prepare_highmem_image - try to allocate as many highmem pages as
- *	there are highmem image pages (@nr_highmem_p points to the variable
- *	containing the number of highmem image pages).  The pages that are
- *	"safe" (ie. will not be overwritten when the suspend image is
- *	restored) have the corresponding bits set in @bm (it must be
- *	unitialized).
- *
- *	NOTE: This function should not be called if there are no highmem
- *	image pages.
- */
-
 static unsigned int safe_highmem_pages;
 
 static struct memory_bitmap *safe_highmem_bm;
 
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages).  The pages that are "safe" (ie. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be unitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
 static int prepare_highmem_image(struct memory_bitmap *bm,
 				 unsigned int *nr_highmem_p)
 {
@@ -2236,25 +2252,26 @@ static int prepare_highmem_image(struct memory_bitmap *bm,
 	return 0;
 }
 
+static struct page *last_highmem_page;
+
 /**
- *	get_highmem_page_buffer - for given highmem image page find the buffer
- *	that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
+ *
+ * For a given highmem image page get a buffer that suspend_write_next() should
+ * return to its caller to write to.
  *
- *	If the page is to be saved to its "original" page frame or a copy of
- *	the page is to be made in the highmem, @buffer is returned.  Otherwise,
- *	the copy of the page is to be made in normal memory, so the address of
- *	the copy is returned.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned.  Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
  *
- *	If @buffer is returned, the caller of suspend_write_next() will write
- *	the page's contents to @buffer, so they will have to be copied to the
- *	right location on the next call to suspend_write_next() and it is done
- *	with the help of copy_last_highmem_page().  For this purpose, if
- *	@buffer is returned, @last_highmem page is set to the page to which
- *	the data will have to be copied from @buffer.
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page().  For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
  */
-
-static struct page *last_highmem_page;
-
 static void *get_highmem_page_buffer(struct page *page,
 				     struct chain_allocator *ca)
 {
@@ -2262,13 +2279,15 @@ static void *get_highmem_page_buffer(struct page *page,
 	void *kaddr;
 
 	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
-		/* We have allocated the "original" page frame and we can
+		/*
+		 * We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
 		 */
 		last_highmem_page = page;
 		return buffer;
 	}
-	/* The "original" page frame has not been allocated and we have to
+	/*
+	 * The "original" page frame has not been allocated and we have to
 	 * use a "safe" page frame to store the loaded page.
 	 */
 	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2298,11 +2317,12 @@ static void *get_highmem_page_buffer(struct page *page,
 }
 
 /**
- *	copy_last_highmem_page - copy the contents of a highmem image from
- *	@buffer, where the caller of snapshot_write_next() has place them,
- *	to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy most the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page .
  */
-
 static void copy_last_highmem_page(void)
 {
 	if (last_highmem_page) {
@@ -2345,22 +2365,23 @@ static inline int last_highmem_page_copied(void) { return 1; }
 static inline void free_highmem_data(void) {}
 #endif /* CONFIG_HIGHMEM */
 
+#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
 /**
- *	prepare_image - use the memory bitmap @bm to mark the pages that will
- *	be overwritten in the process of restoring the system memory state
- *	from the suspend image ("unsafe" pages) and allocate memory for the
- *	image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Unitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
+ *
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
  *
- *	The idea is to allocate a new memory bitmap first and then allocate
- *	as many pages as needed for the image data, but not to assign these
- *	pages to specific tasks initially.  Instead, we just mark them as
- *	allocated and create a lists of "safe" pages that will be used
- *	later.  On systems with high memory a list of "safe" highmem pages is
- *	also created.
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet.  Instead, we mark them all as allocated and
+ * create a lists of "safe" pages to be used later.  On systems with high
+ * memory a list of "safe" highmem pages is created too.
  */
-
-#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
 static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
 	unsigned int nr_pages, nr_highmem;
@@ -2385,7 +2406,8 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 		if (error)
 			goto Free;
 	}
-	/* Reserve some safe pages for potential later use.
+	/*
+	 * Reserve some safe pages for potential later use.
 	 *
 	 * NOTE: This way we make sure there will be enough safe pages for the
 	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
@@ -2431,10 +2453,11 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 }
 
 /**
- *	get_buffer - compute the address that snapshot_write_next() should
- *	set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
  */
-
 static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
 	struct pbe *pbe;
@@ -2449,12 +2472,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 		return get_highmem_page_buffer(page, ca);
 
 	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
-		/* We have allocated the "original" page frame and we can
+		/*
+		 * We have allocated the "original" page frame and we can
 		 * use it directly to store the loaded page.
 		 */
 		return page_address(page);
 
-	/* The "original" page frame has not been allocated and we have to
+	/*
+	 * The "original" page frame has not been allocated and we have to
 	 * use a "safe" page frame to store the loaded page.
 	 */
 	pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2471,22 +2496,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 }
 
 /**
- *	snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
  *
- *	On the first call to it @handle should point to a zeroed
- *	snapshot_handle structure.  The structure gets updated and a pointer
- *	to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure.  The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
  *
- *	On success the function returns a positive number.  Then, the caller
- *	is allowed to write up to the returned number of bytes to the memory
- *	location computed by the data_of() macro.
+ * On success, the function returns a positive number.  Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
  *
- *	The function returns 0 to indicate the "end of file" condition,
- *	and a negative number is returned on error.  In such cases the
- *	structure pointed to by @handle is not updated and should not be used
- *	any more.
+ * The function returns 0 to indicate the "end of file" condition.  Negative
+ * numbers are returned on errors, in which cases the structure pointed to by
+ * @handle is not updated and should not be used any more.
  */
-
 int snapshot_write_next(struct snapshot_handle *handle)
 {
 	static struct chain_allocator ca;
@@ -2556,13 +2580,13 @@ int snapshot_write_next(struct snapshot_handle *handle)
 }
 
 /**
- *	snapshot_write_finalize - must be called after the last call to
- *	snapshot_write_next() in case the last page in the image happens
- *	to be a highmem page and its contents should be stored in the
- *	highmem.  Additionally, it releases the memory that will not be
- *	used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem.  Additionally, it recycles bitmap memory that's not
+ * necessary any more.
  */
-
 void snapshot_write_finalize(struct snapshot_handle *handle)
 {
 	copy_last_highmem_page();
@@ -2599,15 +2623,15 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2,
 }
 
 /**
- *	restore_highmem - for each highmem page that was allocated before
- *	the suspend and included in the suspend image, and also has been
- *	allocated by the "resume" kernel swap its current (ie. "before
- *	resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
+ *
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
  *
- *	If the resume eventually fails, we can call this function once
- *	again and restore the "before resume" highmem state.
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
  */
-
 int restore_highmem(void)
 {
 	struct highmem_pbe *pbe = highmem_pblist;
-- 
cgit v1.2.3


From d5f32af3100165cbd625855bd155b3aa9bd87ebf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 6 Jul 2016 23:44:31 +0200
Subject: PM / hibernate: Add missing braces in __register_nosave_region()

One branch of an if/else statement in __register_nosave_region() is
formatted against the kernel coding style which causes the code to
look slightly odd.  To fix that, add missing braces to it.

No functional changes.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bd927d9efeb7..d64d5d0efa79 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -920,9 +920,10 @@ void __init __register_nosave_region(unsigned long start_pfn,
 		/* During init, this shouldn't fail */
 		region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
 		BUG_ON(!region);
-	} else
+	} else {
 		/* This allocation cannot fail */
 		region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+	}
 	region->start_pfn = start_pfn;
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
-- 
cgit v1.2.3


From 4c0b6c10fbaf0c82efe2a7ba6c236c633d4f2ed7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 Jul 2016 02:12:10 +0200
Subject: PM / hibernate: Image data protection during restoration

Make it possible to protect all pages holding image data during
hibernate image restoration by setting them read-only (so as to
catch attempts to write to those pages after image data have been
stored in them).

This adds overhead to image restoration code (it may cause large
page mappings to be split as a result of page flags changes) and
the errors it protects against should never happen in theory, so
the feature is only active after passing hibernate=protect_image
to the command line of the restore kernel.

Also it only is built if CONFIG_DEBUG_RODATA is set.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c |  3 +++
 kernel/power/power.h     |  7 +++++++
 kernel/power/snapshot.c  | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b00f270d328e..51441d87f0b6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1126,6 +1126,9 @@ static int __init hibernate_setup(char *str)
 	} else if (!strncmp(str, "no", 2)) {
 		noresume = 1;
 		nohibernate = 1;
+	} else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+		   && !strncmp(str, "protect_image", 13)) {
+		enable_restore_image_protection();
 	}
 	return 1;
 }
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 51f02ecaf125..064963e89194 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -59,6 +59,13 @@ extern int hibernation_snapshot(int platform_mode);
 extern int hibernation_restore(int platform_mode);
 extern int hibernation_platform_enter(void);
 
+#ifdef CONFIG_DEBUG_RODATA
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
 #else /* !CONFIG_HIBERNATION */
 
 static inline void hibernate_reserved_size_init(void) {}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d64d5d0efa79..d90df926b59f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -38,6 +38,43 @@
 
 #include "power.h"
 
+#ifdef CONFIG_DEBUG_RODATA
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+	hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+	hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+	hibernate_restore_protection_active = false;
+}
+
+static inline void hibernate_restore_protect_page(void *page_address)
+{
+	if (hibernate_restore_protection_active)
+		set_memory_ro((unsigned long)page_address, 1);
+}
+
+static inline void hibernate_restore_unprotect_page(void *page_address)
+{
+	if (hibernate_restore_protection_active)
+		set_memory_rw((unsigned long)page_address, 1);
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline void hibernate_restore_protect_page(void *page_address) {}
+static inline void hibernate_restore_unprotect_page(void *page_address) {}
+#endif /* CONFIG_DEBUG_RODATA */
+
 static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
 static void swsusp_unset_page_forbidden(struct page *);
@@ -1414,6 +1451,7 @@ loop:
 
 		memory_bm_clear_current(forbidden_pages_map);
 		memory_bm_clear_current(free_pages_map);
+		hibernate_restore_unprotect_page(page_address(page));
 		__free_page(page);
 		goto loop;
 	}
@@ -1425,6 +1463,7 @@ out:
 	buffer = NULL;
 	alloc_normal = 0;
 	alloc_highmem = 0;
+	hibernate_restore_protection_end();
 }
 
 /* Helper functions used for the shrinking of memory. */
@@ -2548,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		if (error)
 			return error;
 
+		hibernate_restore_protection_begin();
 	} else if (handle->cur <= nr_meta_pages + 1) {
 		error = unpack_orig_pfns(buffer, &copy_bm);
 		if (error)
@@ -2570,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle)
 		copy_last_highmem_page();
 		/* Restore page key for data page (s390 only). */
 		page_key_write(handle->buffer);
+		hibernate_restore_protect_page(handle->buffer);
 		handle->buffer = get_buffer(&orig_bm, &ca);
 		if (IS_ERR(handle->buffer))
 			return PTR_ERR(handle->buffer);
@@ -2594,6 +2635,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
 	/* Restore page key for data page (s390 only). */
 	page_key_write(handle->buffer);
 	page_key_free();
+	hibernate_restore_protect_page(handle->buffer);
 	/* Do that only if we have loaded the image entirely */
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
 		memory_bm_recycle(&orig_bm);
-- 
cgit v1.2.3


From a536a6e13ecd0d6eb0ffc36c5d56555896617282 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 11 Jul 2016 12:51:01 -0400
Subject: bpf: make inode code explicitly non-modular

The Kconfig currently controlling compilation of this code is:

init/Kconfig:config BPF_SYSCALL
init/Kconfig:   bool "Enable bpf() system call"

...meaning that it currently is not being built as a module by anyone.

Lets remove the couple traces of modular infrastructure use, so that
when reading the driver there is no doubt it is builtin-only.

Note that MODULE_ALIAS is a no-op for non-modular code.

We replace module.h with init.h since the file does use __init.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: netdev@vger.kernel.org
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 318858edb1cd..5967b870a895 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
  * version 2 as published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/magic.h>
 #include <linux/major.h>
 #include <linux/mount.h>
@@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = {
 	.kill_sb	= kill_litter_super,
 };
 
-MODULE_ALIAS_FS("bpf");
-
 static int __init bpf_init(void)
 {
 	int ret;
-- 
cgit v1.2.3


From 6a4e24518c8a10f78f44da219835239cb5aca90d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:03 +0000
Subject: cpu/hotplug: Handle early registration gracefully

We switched the hotplug machinery to smpboot threads. Early registration of
hotplug callbacks, i.e. from do_pre_smp_initcalls(), happens before the
threads are initialized. Instead of moving the thread init, we simply handle
it in the hotplug code itself and invoke the function directly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153332.896450738@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7b61887f7ccd..fe71ce4e60f1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state,
 	if (!cpu_online(cpu))
 		return 0;
 
+	/*
+	 * If we are up and running, use the hotplug thread. For early calls
+	 * we invoke the thread function directly.
+	 */
+	if (!st->thread)
+		return cpuhp_invoke_callback(cpu, state, cb);
+
 	st->cb_state = state;
 	st->cb = cb;
 	/*
-- 
cgit v1.2.3


From 00e16c3d68fce504e880f59c9bdf23b2a4759d6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:09 +0000
Subject: perf/core: Convert to hotplug state machine

Actually a nice symmetric startup/teardown pair which fits properly into
the state machine concept. In the long run we should be able to invoke
the startup callback for the boot CPU via the state machine and get
rid of the init function which invokes it on the boot CPU.

Note: This comes actually before the perf hardware callbacks. In the notifier
model the hardware callbacks have a higher priority than the core
callback. But that's solely for CPU offline so that hardware migration of
events happens before the core is notified about the outgoing CPU.

With the symetric state array model we have the following ordering:

 UP:     core -> hardware
 DOWN:   hardware -> core

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153333.587514098@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c         | 11 +++++++++++
 kernel/events/core.c | 56 ++++++++++------------------------------------------
 2 files changed, 21 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe71ce4e60f1..3705d9043c08 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1180,6 +1180,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= NULL,
 		.cant_stop		= true,
 	},
+	[CPUHP_PERF_PREPARE] = {
+		.name = "perf prepare",
+		.startup = perf_event_init_cpu,
+		.teardown = perf_event_exit_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1257,6 +1262,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup		= smpboot_unpark_threads,
 		.teardown		= NULL,
 	},
+	[CPUHP_AP_PERF_ONLINE] = {
+		.name = "perf online",
+		.startup = perf_event_init_cpu,
+		.teardown = perf_event_exit_cpu,
+	},
+
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
 	 * are converted to states.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 43d43a2d5811..f3ef1c29a7c9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10255,7 +10255,7 @@ static void __init perf_event_init_all_cpus(void)
 	}
 }
 
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -10268,6 +10268,7 @@ static void perf_event_init_cpu(int cpu)
 		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
 	mutex_unlock(&swhash->hlist_mutex);
+	return 0;
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10299,14 +10300,17 @@ static void perf_event_exit_cpu_context(int cpu)
 	}
 	srcu_read_unlock(&pmus_srcu, idx);
 }
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
+
+#endif
 
-static void perf_event_exit_cpu(int cpu)
+int perf_event_exit_cpu(unsigned int cpu)
 {
 	perf_event_exit_cpu_context(cpu);
+	return 0;
 }
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
 
 static int
 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10328,46 +10332,6 @@ static struct notifier_block perf_reboot_notifier = {
 	.priority = INT_MIN,
 };
 
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (long)hcpu;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-
-	case CPU_UP_PREPARE:
-		/*
-		 * This must be done before the CPU comes alive, because the
-		 * moment we can run tasks we can encounter (software) events.
-		 *
-		 * Specifically, someone can have inherited events on kthreadd
-		 * or a pre-existing worker thread that gets re-bound.
-		 */
-		perf_event_init_cpu(cpu);
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/*
-		 * This must be done before the CPU dies because after that an
-		 * active event might want to IPI the CPU and that'll not work
-		 * so great for dead CPUs.
-		 *
-		 * XXX smp_call_function_single() return -ENXIO without a warn
-		 * so we could possibly deal with this.
-		 *
-		 * This is safe against new events arriving because
-		 * sys_perf_event_open() serializes against hotplug using
-		 * get_online_cpus().
-		 */
-		perf_event_exit_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
 void __init perf_event_init(void)
 {
 	int ret;
@@ -10380,7 +10344,7 @@ void __init perf_event_init(void)
 	perf_pmu_register(&perf_cpu_clock, NULL, -1);
 	perf_pmu_register(&perf_task_clock, NULL, -1);
 	perf_tp_register();
-	perf_cpu_notifier(perf_cpu_notify);
+	perf_event_init_cpu(smp_processor_id());
 	register_reboot_notifier(&perf_reboot_notifier);
 
 	ret = init_hw_breakpoint();
-- 
cgit v1.2.3


From 7ee681b25284782ecf380bf5ccf55f13c52fd0ce Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:29 +0000
Subject: workqueue: Convert to state machine callbacks

Get rid of the prio ordering of the separate notifiers and use a proper state
callback pair.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153335.197083890@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c       |  10 +++++
 kernel/workqueue.c | 108 +++++++++++++++++++++--------------------------------
 2 files changed, 53 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3705d9043c08..af53f820fec9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1185,6 +1185,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = perf_event_init_cpu,
 		.teardown = perf_event_exit_cpu,
 	},
+	[CPUHP_WORKQUEUE_PREP] = {
+		.name = "workqueue prepare",
+		.startup = workqueue_prepare_cpu,
+		.teardown = NULL,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1267,6 +1272,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup = perf_event_init_cpu,
 		.teardown = perf_event_exit_cpu,
 	},
+	[CPUHP_AP_WORKQUEUE_ONLINE] = {
+		.name = "workqueue online",
+		.startup = workqueue_online_cpu,
+		.teardown = workqueue_offline_cpu,
+	},
 
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e1c0e996b5ae..c9dd5fbdbf33 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4611,84 +4611,65 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 						  pool->attrs->cpumask) < 0);
 }
 
-/*
- * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
- */
-static int workqueue_cpu_up_callback(struct notifier_block *nfb,
-					       unsigned long action,
-					       void *hcpu)
+int workqueue_prepare_cpu(unsigned int cpu)
+{
+	struct worker_pool *pool;
+
+	for_each_cpu_worker_pool(pool, cpu) {
+		if (pool->nr_workers)
+			continue;
+		if (!create_worker(pool))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+int workqueue_online_cpu(unsigned int cpu)
 {
-	int cpu = (unsigned long)hcpu;
 	struct worker_pool *pool;
 	struct workqueue_struct *wq;
 	int pi;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_UP_PREPARE:
-		for_each_cpu_worker_pool(pool, cpu) {
-			if (pool->nr_workers)
-				continue;
-			if (!create_worker(pool))
-				return NOTIFY_BAD;
-		}
-		break;
-
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		mutex_lock(&wq_pool_mutex);
+	mutex_lock(&wq_pool_mutex);
 
-		for_each_pool(pool, pi) {
-			mutex_lock(&pool->attach_mutex);
+	for_each_pool(pool, pi) {
+		mutex_lock(&pool->attach_mutex);
 
-			if (pool->cpu == cpu)
-				rebind_workers(pool);
-			else if (pool->cpu < 0)
-				restore_unbound_workers_cpumask(pool, cpu);
+		if (pool->cpu == cpu)
+			rebind_workers(pool);
+		else if (pool->cpu < 0)
+			restore_unbound_workers_cpumask(pool, cpu);
 
-			mutex_unlock(&pool->attach_mutex);
-		}
+		mutex_unlock(&pool->attach_mutex);
+	}
 
-		/* update NUMA affinity of unbound workqueues */
-		list_for_each_entry(wq, &workqueues, list)
-			wq_update_unbound_numa(wq, cpu, true);
+	/* update NUMA affinity of unbound workqueues */
+	list_for_each_entry(wq, &workqueues, list)
+		wq_update_unbound_numa(wq, cpu, true);
 
-		mutex_unlock(&wq_pool_mutex);
-		break;
-	}
-	return NOTIFY_OK;
+	mutex_unlock(&wq_pool_mutex);
+	return 0;
 }
 
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int workqueue_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
+int workqueue_offline_cpu(unsigned int cpu)
 {
-	int cpu = (unsigned long)hcpu;
 	struct work_struct unbind_work;
 	struct workqueue_struct *wq;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		/* unbinding per-cpu workers should happen on the local CPU */
-		INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
-		queue_work_on(cpu, system_highpri_wq, &unbind_work);
-
-		/* update NUMA affinity of unbound workqueues */
-		mutex_lock(&wq_pool_mutex);
-		list_for_each_entry(wq, &workqueues, list)
-			wq_update_unbound_numa(wq, cpu, false);
-		mutex_unlock(&wq_pool_mutex);
-
-		/* wait for per-cpu unbinding to finish */
-		flush_work(&unbind_work);
-		destroy_work_on_stack(&unbind_work);
-		break;
-	}
-	return NOTIFY_OK;
+	/* unbinding per-cpu workers should happen on the local CPU */
+	INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
+	queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+	/* update NUMA affinity of unbound workqueues */
+	mutex_lock(&wq_pool_mutex);
+	list_for_each_entry(wq, &workqueues, list)
+		wq_update_unbound_numa(wq, cpu, false);
+	mutex_unlock(&wq_pool_mutex);
+
+	/* wait for per-cpu unbinding to finish */
+	flush_work(&unbind_work);
+	destroy_work_on_stack(&unbind_work);
+	return 0;
 }
 
 #ifdef CONFIG_SMP
@@ -5490,9 +5471,6 @@ static int __init init_workqueues(void)
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-
 	wq_numa_init();
 
 	/* initialize CPU pools */
-- 
cgit v1.2.3


From 0b7a0fdb29715e38641beb39db4d01695b22b5aa Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 14 Jul 2016 10:59:19 -0400
Subject: audit: fix whitespace in CWD record

Fix the whitespace in the CWD record

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
[PM: fixed subject line]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ec4c552876a7..aa3feec4df14 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1430,7 +1430,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->pwd.dentry && context->pwd.mnt) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
 		if (ab) {
-			audit_log_d_path(ab, " cwd=", &context->pwd);
+			audit_log_d_path(ab, "cwd=", &context->pwd);
 			audit_log_end(ab);
 		}
 	}
-- 
cgit v1.2.3


From 27590dc17b34aedc4f3e14bd107ee59b9db9b0a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Jul 2016 10:41:04 +0200
Subject: hrtimer: Convert to hotplug state machine

Split out the clockevents callbacks instead of piggybacking them on
hrtimers.

This gets rid of a POST_DEAD user. See commit:

  54e88fad223c ("sched: Make sure timers have migrated before killing the migration_thread")

We just move the callback state to the proper place in the state machine.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.485419196@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c          |  5 +++++
 kernel/time/hrtimer.c | 40 +++++-----------------------------------
 2 files changed, 10 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index af53f820fec9..85500e7fe238 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1190,6 +1190,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = workqueue_prepare_cpu,
 		.teardown = NULL,
 	},
+	[CPUHP_HRTIMERS_PREPARE] = {
+		.name = "hrtimers prepare",
+		.startup = hrtimers_prepare_cpu,
+		.teardown = hrtimers_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d13c9aebf7a3..9ba7c820fc23 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1590,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 /*
  * Functions related to boot-time initialization:
  */
-static void init_hrtimers_cpu(int cpu)
+int hrtimers_prepare_cpu(unsigned int cpu)
 {
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 	int i;
@@ -1602,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu)
 
 	cpu_base->cpu = cpu;
 	hrtimer_init_hres(cpu_base);
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1636,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
-static void migrate_hrtimers(int scpu)
+int hrtimers_dead_cpu(unsigned int scpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
 	int i;
@@ -1665,45 +1666,14 @@ static void migrate_hrtimers(int scpu)
 	/* Check, if we got expired work to do */
 	__hrtimer_peek_ahead_timers();
 	local_irq_enable();
+	return 0;
 }
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int hrtimer_cpu_notify(struct notifier_block *self,
-					unsigned long action, void *hcpu)
-{
-	int scpu = (long)hcpu;
-
-	switch (action) {
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		init_hrtimers_cpu(scpu);
-		break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_hrtimers(scpu);
-		break;
-#endif
-
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block hrtimers_nb = {
-	.notifier_call = hrtimer_cpu_notify,
-};
-
 void __init hrtimers_init(void)
 {
-	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
-			  (void *)(long)smp_processor_id());
-	register_cpu_notifier(&hrtimers_nb);
+	hrtimers_prepare_cpu(smp_processor_id());
 }
 
 /**
-- 
cgit v1.2.3


From 24f73b99716a9cd8cbb345c41ced6b3b5ed94006 Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:59 +0000
Subject: timers/core: Convert to hotplug state machine

When tearing down, call timers_dead_cpu() before notify_dead().
There is a hidden dependency between:

 - timers
 - block multiqueue
 - rcutree

If timers_dead_cpu() comes later than blk_mq_queue_reinit_notify()
that latter function causes a RCU stall.

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.566790058@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c        |  5 +++++
 kernel/time/timer.c | 25 ++-----------------------
 2 files changed, 7 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 85500e7fe238..e1017d92d308 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = hrtimers_prepare_cpu,
 		.teardown = hrtimers_dead_cpu,
 	},
+	[CPUHP_TIMERS_DEAD] = {
+		.name = "timers dead",
+		.startup = NULL,
+		.teardown = timers_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index cb9ab401e2d9..555670a5143c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1804,7 +1804,7 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
 	}
 }
 
-static void migrate_timers(int cpu)
+int timers_dead_cpu(unsigned int cpu)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
@@ -1831,29 +1831,9 @@ static void migrate_timers(int cpu)
 		spin_unlock_irq(&new_base->lock);
 		put_cpu_ptr(&timer_bases);
 	}
+	return 0;
 }
 
-static int timer_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	switch (action) {
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_timers((long)hcpu);
-		break;
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static inline void timer_register_cpu_notifier(void)
-{
-	cpu_notifier(timer_cpu_notify, 0);
-}
-#else
-static inline void timer_register_cpu_notifier(void) { }
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void __init init_timer_cpu(int cpu)
@@ -1881,7 +1861,6 @@ void __init init_timers(void)
 {
 	init_timer_cpus();
 	init_timer_stats();
-	timer_register_cpu_notifier();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
-- 
cgit v1.2.3


From e722d8daafb974b9ad1bbaf42f384a5ea5929f5f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 13 Jul 2016 17:16:59 +0000
Subject: profile: Convert to hotplug state machine

Install the callbacks via the state machine and let the core invoke
the callbacks on the already online CPUs. A lot of code is removed because
the for-loop is used and create_hash_tables() is removed since its purpose
is covered by the startup / teardown hooks.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.649867675@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/profile.c | 181 ++++++++++++++++++++-----------------------------------
 1 file changed, 65 insertions(+), 116 deletions(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index c2199e9901c9..2dbccf2d806c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -328,68 +328,57 @@ out:
 	put_cpu();
 }
 
-static int profile_cpu_callback(struct notifier_block *info,
-					unsigned long action, void *__cpu)
+static int profile_dead_cpu(unsigned int cpu)
 {
-	int node, cpu = (unsigned long)__cpu;
 	struct page *page;
+	int i;
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		node = cpu_to_mem(cpu);
-		per_cpu(cpu_profile_flip, cpu) = 0;
-		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = __alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO,
-					0);
-			if (!page)
-				return notifier_from_errno(-ENOMEM);
-			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
-		}
-		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = __alloc_pages_node(node,
-					GFP_KERNEL | __GFP_ZERO,
-					0);
-			if (!page)
-				goto out_free;
-			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
-		}
-		break;
-out_free:
-		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-		__free_page(page);
-		return notifier_from_errno(-ENOMEM);
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		if (prof_cpu_mask != NULL)
-			cpumask_set_cpu(cpu, prof_cpu_mask);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		if (prof_cpu_mask != NULL)
-			cpumask_clear_cpu(cpu, prof_cpu_mask);
-		if (per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
-			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
+	if (prof_cpu_mask != NULL)
+		cpumask_clear_cpu(cpu, prof_cpu_mask);
+
+	for (i = 0; i < 2; i++) {
+		if (per_cpu(cpu_profile_hits, cpu)[i]) {
+			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
+			per_cpu(cpu_profile_hits, cpu)[i] = NULL;
 			__free_page(page);
 		}
-		if (per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-			__free_page(page);
+	}
+	return 0;
+}
+
+static int profile_prepare_cpu(unsigned int cpu)
+{
+	int i, node = cpu_to_mem(cpu);
+	struct page *page;
+
+	per_cpu(cpu_profile_flip, cpu) = 0;
+
+	for (i = 0; i < 2; i++) {
+		if (per_cpu(cpu_profile_hits, cpu)[i])
+			continue;
+
+		page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		if (!page) {
+			profile_dead_cpu(cpu);
+			return -ENOMEM;
 		}
-		break;
+		per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
+
 	}
-	return NOTIFY_OK;
+	return 0;
+}
+
+static int profile_online_cpu(unsigned int cpu)
+{
+	if (prof_cpu_mask != NULL)
+		cpumask_set_cpu(cpu, prof_cpu_mask);
+
+	return 0;
 }
+
 #else /* !CONFIG_SMP */
 #define profile_flip_buffers()		do { } while (0)
 #define profile_discard_flip_buffers()	do { } while (0)
-#define profile_cpu_callback		NULL
 
 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 {
@@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = {
 	.llseek		= default_llseek,
 };
 
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
+int __ref create_proc_profile(void)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		int node = cpu_to_mem(cpu);
-		struct page *page;
-
-		page = __alloc_pages_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				0);
-		if (!page)
-			goto out_cleanup;
-		per_cpu(cpu_profile_hits, cpu)[1]
-				= (struct profile_hit *)page_address(page);
-		page = __alloc_pages_node(node,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				0);
-		if (!page)
-			goto out_cleanup;
-		per_cpu(cpu_profile_hits, cpu)[0]
-				= (struct profile_hit *)page_address(page);
-	}
-	return 0;
-out_cleanup:
-	prof_on = 0;
-	smp_mb();
-	on_each_cpu(profile_nop, NULL, 1);
-	for_each_online_cpu(cpu) {
-		struct page *page;
-
-		if (per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
-			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
-			__free_page(page);
-		}
-		if (per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
-			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
-			__free_page(page);
-		}
-	}
-	return -1;
-}
-#else
-#define create_hash_tables()			({ 0; })
+	struct proc_dir_entry *entry;
+#ifdef CONFIG_SMP
+	enum cpuhp_state online_state;
 #endif
 
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
-{
-	struct proc_dir_entry *entry;
 	int err = 0;
 
 	if (!prof_on)
 		return 0;
-
-	cpu_notifier_register_begin();
-
-	if (create_hash_tables()) {
-		err = -ENOMEM;
-		goto out;
-	}
-
+#ifdef CONFIG_SMP
+	err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
+				profile_prepare_cpu, profile_dead_cpu);
+	if (err)
+		return err;
+
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
+				profile_online_cpu, NULL);
+	if (err < 0)
+		goto err_state_prep;
+	online_state = err;
+	err = 0;
+#endif
 	entry = proc_create("profile", S_IWUSR | S_IRUGO,
 			    NULL, &proc_profile_operations);
 	if (!entry)
-		goto out;
+		goto err_state_onl;
 	proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
-	__hotcpu_notifier(profile_cpu_callback, 0);
 
-out:
-	cpu_notifier_register_done();
+	return err;
+err_state_onl:
+#ifdef CONFIG_SMP
+	cpuhp_remove_state(online_state);
+err_state_prep:
+	cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
+#endif
 	return err;
 }
 subsys_initcall(create_proc_profile);
-- 
cgit v1.2.3


From 31487f8328f20fdb302430b020a5d6e8446c1971 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 13 Jul 2016 17:17:01 +0000
Subject: smp/cfd: Convert core to hotplug state machine

Install the callbacks via the state machine. They are installed at runtime so
smpcfd_prepare_cpu() needs to be invoked by the boot-CPU.

Signed-off-by: Richard Weinberger <richard@nod.at>
[ Added the dropped CPU dying case back in. ]
Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Davidlohr Bueso <dave@stgolabs>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.818376366@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c |  9 +++++++
 kernel/smp.c | 79 ++++++++++++++++++++++++------------------------------------
 2 files changed, 41 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1017d92d308..008e2fd40cb1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1195,6 +1195,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = hrtimers_prepare_cpu,
 		.teardown = hrtimers_dead_cpu,
 	},
+	[CPUHP_SMPCFD_PREPARE] = {
+		.name = "SMPCFD prepare",
+		.startup = smpcfd_prepare_cpu,
+		.teardown = smpcfd_dead_cpu,
+	},
 	[CPUHP_TIMERS_DEAD] = {
 		.name = "timers dead",
 		.startup = NULL,
@@ -1218,6 +1223,10 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.teardown		= NULL,
 		.cant_stop		= true,
 	},
+	[CPUHP_AP_SMPCFD_DYING] = {
+		.startup = NULL,
+		.teardown = smpcfd_dying_cpu,
+	},
 	/*
 	 * Handled on controll processor until the plugged processor manages
 	 * this itself.
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..7180491c9678 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
 static void flush_smp_call_function_queue(bool warn_cpu_offline);
 
-static int
-hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+int smpcfd_prepare_cpu(unsigned int cpu)
 {
-	long cpu = (long)hcpu;
 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
-				cpu_to_node(cpu)))
-			return notifier_from_errno(-ENOMEM);
-		cfd->csd = alloc_percpu(struct call_single_data);
-		if (!cfd->csd) {
-			free_cpumask_var(cfd->cpumask);
-			return notifier_from_errno(-ENOMEM);
-		}
-		break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		/* Fall-through to the CPU_DEAD[_FROZEN] case. */
-
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+				     cpu_to_node(cpu)))
+		return -ENOMEM;
+	cfd->csd = alloc_percpu(struct call_single_data);
+	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
-		free_percpu(cfd->csd);
-		break;
+		return -ENOMEM;
+	}
 
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		/*
-		 * The IPIs for the smp-call-function callbacks queued by other
-		 * CPUs might arrive late, either due to hardware latencies or
-		 * because this CPU disabled interrupts (inside stop-machine)
-		 * before the IPIs were sent. So flush out any pending callbacks
-		 * explicitly (without waiting for the IPIs to arrive), to
-		 * ensure that the outgoing CPU doesn't go offline with work
-		 * still pending.
-		 */
-		flush_smp_call_function_queue(false);
-		break;
-#endif
-	};
+	return 0;
+}
+
+int smpcfd_dead_cpu(unsigned int cpu)
+{
+	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
-	return NOTIFY_OK;
+	free_cpumask_var(cfd->cpumask);
+	free_percpu(cfd->csd);
+	return 0;
 }
 
-static struct notifier_block hotplug_cfd_notifier = {
-	.notifier_call		= hotplug_cfd,
-};
+int smpcfd_dying_cpu(unsigned int cpu)
+{
+	/*
+	 * The IPIs for the smp-call-function callbacks queued by other
+	 * CPUs might arrive late, either due to hardware latencies or
+	 * because this CPU disabled interrupts (inside stop-machine)
+	 * before the IPIs were sent. So flush out any pending callbacks
+	 * explicitly (without waiting for the IPIs to arrive), to
+	 * ensure that the outgoing CPU doesn't go offline with work
+	 * still pending.
+	 */
+	flush_smp_call_function_queue(false);
+	return 0;
+}
 
 void __init call_function_init(void)
 {
-	void *cpu = (void *)(long)smp_processor_id();
 	int i;
 
 	for_each_possible_cpu(i)
 		init_llist_head(&per_cpu(call_single_queue, i));
 
-	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
-	register_cpu_notifier(&hotplug_cfd_notifier);
+	smpcfd_prepare_cpu(smp_processor_id());
 }
 
 /*
-- 
cgit v1.2.3


From 4df8374254ea9294dfe4b8c447a1b7eddc543dbf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Jul 2016 17:17:03 +0000
Subject: rcu: Convert rcutree to hotplug state machine

Straight forward conversion to the state machine. Though the question arises
whether this needs really all these state transitions to work.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160713153337.982013161@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c      |  14 ++++++++
 kernel/rcu/tree.c | 105 +++++++++++++++++++++++++++---------------------------
 2 files changed, 66 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 008e2fd40cb1..f24f45915b54 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1205,6 +1205,11 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = NULL,
 		.teardown = timers_dead_cpu,
 	},
+	[CPUHP_RCUTREE_PREP] = {
+		.name = "RCU-tree prepare",
+		.startup = rcutree_prepare_cpu,
+		.teardown = rcutree_dead_cpu,
+	},
 	/*
 	 * Preparatory and dead notifiers. Will be replaced once the notifiers
 	 * are converted to states.
@@ -1263,6 +1268,10 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup		= sched_cpu_starting,
 		.teardown		= sched_cpu_dying,
 	},
+	[CPUHP_AP_RCUTREE_DYING] = {
+		.startup = NULL,
+		.teardown = rcutree_dying_cpu,
+	},
 	/*
 	 * Low level startup/teardown notifiers. Run with interrupts
 	 * disabled. Will be removed once the notifiers are converted to
@@ -1296,6 +1305,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup = workqueue_online_cpu,
 		.teardown = workqueue_offline_cpu,
 	},
+	[CPUHP_AP_RCUTREE_ONLINE] = {
+		.name = "RCU-tree online",
+		.startup = rcutree_online_cpu,
+		.teardown = rcutree_offline_cpu,
+	},
 
 	/*
 	 * Online/down_prepare notifiers. Will be removed once the notifiers
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f433959e9322..5d80925e7fc8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1073,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching);
  * offline to continue to use RCU for one jiffy after marking itself
  * offline in the cpu_online_mask.  This leniency is necessary given the
  * non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * the fact that a CPU enters the scheduler after completing the teardown
+ * of the CPU.
  *
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * This is also why RCU internally marks CPUs online during in the
+ * preparation phase and offline after the CPU has been taken down.
  *
  * Disable checking if in an NMI handler because we cannot safely report
  * errors from NMI handlers anyway.
@@ -3806,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
-static void rcu_prepare_cpu(int cpu)
+int rcutree_prepare_cpu(unsigned int cpu)
 {
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
 		rcu_init_percpu_data(cpu, rsp);
+
+	rcu_prepare_kthreads(cpu);
+	rcu_spawn_all_nocb_kthreads(cpu);
+
+	return 0;
+}
+
+static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
+{
+	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
+
+	rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
+}
+
+int rcutree_online_cpu(unsigned int cpu)
+{
+	sync_sched_exp_online_cleanup(cpu);
+	rcutree_affinity_setting(cpu, -1);
+	return 0;
+}
+
+int rcutree_offline_cpu(unsigned int cpu)
+{
+	rcutree_affinity_setting(cpu, cpu);
+	return 0;
+}
+
+
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp)
+		rcu_cleanup_dying_cpu(rsp);
+	return 0;
+}
+
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		rcu_cleanup_dead_cpu(cpu, rsp);
+		do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+	}
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -3851,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu)
 }
 #endif
 
-/*
- * Handle CPU online/offline notification events.
- */
-int rcu_cpu_notify(struct notifier_block *self,
-		   unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-	struct rcu_state *rsp;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_prepare_cpu(cpu);
-		rcu_prepare_kthreads(cpu);
-		rcu_spawn_all_nocb_kthreads(cpu);
-		break;
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		sync_sched_exp_online_cleanup(cpu);
-		rcu_boost_kthread_setaffinity(rnp, -1);
-		break;
-	case CPU_DOWN_PREPARE:
-		rcu_boost_kthread_setaffinity(rnp, cpu);
-		break;
-	case CPU_DYING:
-	case CPU_DYING_FROZEN:
-		for_each_rcu_flavor(rsp)
-			rcu_cleanup_dying_cpu(rsp);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		for_each_rcu_flavor(rsp) {
-			rcu_cleanup_dead_cpu(cpu, rsp);
-			do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
-		}
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
 static int rcu_pm_notify(struct notifier_block *self,
 			 unsigned long action, void *hcpu)
 {
@@ -4208,10 +4208,9 @@ void __init rcu_init(void)
 	 * this is called early in boot, before either interrupts
 	 * or the scheduler are operational.
 	 */
-	cpu_notifier(rcu_cpu_notify, 0);
 	pm_notifier(rcu_pm_notify, 0);
 	for_each_online_cpu(cpu)
-		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+		rcutree_prepare_cpu(cpu);
 }
 
 #include "tree_exp.h"
-- 
cgit v1.2.3


From 7bd8830875bfa380c68f390efbad893293749324 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:35:24 -0500
Subject: cgroupns: Fix the locking in copy_cgroup_ns

If "clone(CLONE_NEWCGROUP...)" is called it results in a nice lockdep
valid splat.

In __cgroup_proc_write the lock ordering is:
     cgroup_mutex -- through cgroup_kn_lock_live
     cgroup_threadgroup_rwsem

In copy_process the guts of clone the lock ordering is:
     cgroup_threadgroup_rwsem -- through threadgroup_change_begin
     cgroup_mutex -- through copy_namespaces -- copy_cgroup_ns

lockdep reports some a different call chains for the first ordering of
cgroup_mutex and cgroup_threadgroup_rwsem but it is harder to trace.
This is most definitely deadlock potential under the right
circumstances.

Fix this by by skipping the cgroup_mutex and making the locking in
copy_cgroup_ns mirror the locking in cgroup_post_fork which also runs
during fork under the cgroup_threadgroup_rwsem.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..5f01e00cffc4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6309,14 +6309,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	mutex_lock(&cgroup_mutex);
+	/* It is not safe to take cgroup_mutex here */
 	spin_lock_irq(&css_set_lock);
-
 	cset = task_css_set(current);
 	get_css_set(cset);
-
 	spin_unlock_irq(&css_set_lock);
-	mutex_unlock(&cgroup_mutex);
 
 	new_ns = alloc_cgroup_ns();
 	if (IS_ERR(new_ns)) {
-- 
cgit v1.2.3


From eedd0f4cbf5f3b81e82649832091e1d9d53f0709 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:35:51 -0500
Subject: cgroupns: Close race between cgroup_post_fork and copy_cgroup_ns

In most code paths involving cgroup migration cgroup_threadgroup_rwsem
is taken.  There are two exceptions:

- remove_tasks_in_empty_cpuset calls cgroup_transfer_tasks
- vhost_attach_cgroups_work calls cgroup_attach_task_all

With cgroup_threadgroup_rwsem held it is guaranteed that cgroup_post_fork
and copy_cgroup_ns will reference the same css_set from the process calling
fork.

Without such an interlock there process after fork could reference one
css_set from it's new cgroup namespace and another css_set from
task->cgroups, which semantically is nonsensical.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5f01e00cffc4..e75efa819911 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2962,6 +2962,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	int retval = 0;
 
 	mutex_lock(&cgroup_mutex);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	for_each_root(root) {
 		struct cgroup *from_cgrp;
 
@@ -2976,6 +2977,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	mutex_unlock(&cgroup_mutex);
 
 	return retval;
@@ -4343,6 +4345,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 
 	mutex_lock(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* all tasks in @from are being moved, all csets are source */
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
@@ -4371,6 +4375,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	} while (task && !ret);
 out_err:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
-- 
cgit v1.2.3


From 726a4994b05ff5b6f83d64b5b43c3251217366ce Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 15 Jul 2016 06:36:44 -0500
Subject: cgroupns: Only allow creation of hierarchies in the initial cgroup
 namespace

Unprivileged users can't use hierarchies if they create them as they do not
have privilieges to the root directory.

Which means the only thing a hiearchy created by an unprivileged user
is good for is expanding the number of cgroup links in every css_set,
which is a DOS attack.

We could allow hierarchies to be created in namespaces in the initial
user namespace.  Unfortunately there is only a single namespace for
the names of heirarchies, so that is likely to create more confusion
than not.

So do the simple thing and restrict hiearchy creation to the initial
cgroup namespace.

Cc: stable@vger.kernel.org
Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e75efa819911..e0be49fc382f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2215,12 +2215,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		goto out_unlock;
 	}
 
-	/*
-	 * We know this subsystem has not yet been bound.  Users in a non-init
-	 * user namespace may only mount hierarchies with no bound subsystems,
-	 * i.e. 'none,name=user1'
-	 */
-	if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+	/* Hierarchies may only be created in the initial cgroup namespace. */
+	if (ns != &init_cgroup_ns) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From 406f992e4a372dafbe3c2cff7efbb2002a5c8ebd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 14 Jul 2016 03:55:23 +0200
Subject: x86 / hibernate: Use hlt_play_dead() when resuming from hibernation

On Intel hardware, native_play_dead() uses mwait_play_dead() by
default and only falls back to the other methods if that fails.
That also happens during resume from hibernation, when the restore
(boot) kernel runs disable_nonboot_cpus() to take all of the CPUs
except for the boot one offline.

However, that is problematic, because the address passed to
__monitor() in mwait_play_dead() is likely to be written to in the
last phase of hibernate image restoration and that causes the "dead"
CPU to start executing instructions again.  Unfortunately, the page
containing the address in that CPU's instruction pointer may not be
valid any more at that point.

First, that page may have been overwritten with image kernel memory
contents already, so the instructions the CPU attempts to execute may
simply be invalid.  Second, the page tables previously used by that
CPU may have been overwritten by image kernel memory contents, so the
address in its instruction pointer is impossible to resolve then.

A report from Varun Koyyalagunta and investigation carried out by
Chen Yu show that the latter sometimes happens in practice.

To prevent it from happening, temporarily change the smp_ops.play_dead
pointer during resume from hibernation so that it points to a special
"play dead" routine which uses hlt_play_dead() and avoids the
inadvertent "revivals" of "dead" CPUs this way.

A slightly unpleasant consequence of this change is that if the
system is hibernated with one or more CPUs offline, it will generally
draw more power after resume than it did before hibernation, because
the physical state entered by CPUs via hlt_play_dead() is higher-power
than the mwait_play_dead() one in the majority of cases.  It is
possible to work around this, but it is unclear how much of a problem
that's going to be in practice, so the workaround will be implemented
later if it turns out to be necessary.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=106371
Reported-by: Varun Koyyalagunta <cpudebug@centtech.com>
Original-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/power/hibernate.c | 7 ++++++-
 kernel/power/power.h     | 2 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 51441d87f0b6..5f3523e18e46 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -409,6 +409,11 @@ int hibernation_snapshot(int platform_mode)
 	goto Close;
 }
 
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+	return disable_nonboot_cpus();
+}
+
 /**
  * resume_target_kernel - Restore system state from a hibernation image.
  * @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +438,7 @@ static int resume_target_kernel(bool platform_mode)
 	if (error)
 		goto Cleanup;
 
-	error = disable_nonboot_cpus();
+	error = hibernate_resume_nonboot_cpu_disable();
 	if (error)
 		goto Enable_cpus;
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 064963e89194..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
 }
 #endif /* CONFIG_ARCH_HIBERNATION_HEADER */
 
+extern int hibernate_resume_nonboot_cpu_disable(void);
+
 /*
  * Keep some memory free so that I/O operations can succeed without paging
  * [Might this be more than 4 MB?]
-- 
cgit v1.2.3


From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:03 +0200
Subject: perf, events: add non-linear data support for raw records

This patch adds support for non-linear data on raw records. It
extends raw records to have one or multiple fragments that will
be written linearly into the ring slot, where each fragment can
optionally have a custom callback handler to walk and extract
complex, possibly non-linear data.

If a callback handler is provided for a fragment, then the new
__output_custom() will be used instead of __output_copy() for
the perf_output_sample() part. perf_prepare_sample() does all
the size calculation only once, so perf_output_sample() doesn't
need to redo the same work anymore, meaning real_size and padding
will be cached in the raw record. The raw record becomes 32 bytes
in size without holes; to not increase it further and to avoid
doing unnecessary recalculations in fast-path, we can reuse
next pointer of the last fragment, idea here is borrowed from
ZERO_OR_NULL_PTR(), which should keep the perf_output_sample()
path for PERF_SAMPLE_RAW minimal.

This facility is needed for BPF's event output helper as a first
user that will, in a follow-up, add an additional perf_raw_frag
to its perf_raw_record in order to be able to more efficiently
dump skb context after a linear head meta data related to it.
skbs can be non-linear and thus need a custom output function to
dump buffers. Currently, the skb data needs to be copied twice;
with the help of __output_custom() this work only needs to be
done once. Future users could be things like XDP/BPF programs
that work on different context though and would thus also have
a different callback function.

The few users of raw records are adapted to initialize their frag
data from the raw record itself, no change in behavior for them.
The code is based upon a PoC diff provided by Peter Zijlstra [1].

  [1] http://thread.gmane.org/gmane.linux.network/421294

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/events/core.c     | 66 +++++++++++++++++++++++++++++++++---------------
 kernel/events/internal.h | 16 +++++++++---
 kernel/trace/bpf_trace.c |  6 +++--
 3 files changed, 62 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9c51ec3f0f44..b1891b6b5c1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			u32 raw_size = data->raw->size;
-			u32 real_size = round_up(raw_size + sizeof(u32),
-						 sizeof(u64)) - sizeof(u32);
-			u64 zero = 0;
-
-			perf_output_put(handle, real_size);
-			__output_copy(handle, data->raw->data, raw_size);
-			if (real_size - raw_size)
-				__output_copy(handle, &zero, real_size - raw_size);
+		struct perf_raw_record *raw = data->raw;
+
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
+
+			perf_output_put(handle, raw->size);
+			do {
+				if (frag->copy) {
+					__output_custom(handle, frag->copy,
+							frag->data, frag->size);
+				} else {
+					__output_copy(handle, frag->data,
+						      frag->size);
+				}
+				if (perf_raw_frag_last(frag))
+					break;
+				frag = frag->next;
+			} while (1);
+			if (frag->pad)
+				__output_skip(handle, NULL, frag->pad);
 		} else {
 			struct {
 				u32	size;
@@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		int size = sizeof(u32);
-
-		if (data->raw)
-			size += data->raw->size;
-		else
-			size += sizeof(u32);
+		struct perf_raw_record *raw = data->raw;
+		int size;
+
+		if (raw) {
+			struct perf_raw_frag *frag = &raw->frag;
+			u32 sum = 0;
+
+			do {
+				sum += frag->size;
+				if (perf_raw_frag_last(frag))
+					break;
+				frag = frag->next;
+			} while (1);
+
+			size = round_up(sum + sizeof(u32), sizeof(u64));
+			raw->size = size - sizeof(u32);
+			frag->pad = raw->size - sum;
+		} else {
+			size = sizeof(u64);
+		}
 
-		header->size += round_up(size, sizeof(u64));
+		header->size += size;
 	}
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = {
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
-	void *record = data->raw->data;
+	void *record = data->raw->frag.data;
 
 	/* only top level events have filters set */
 	if (event->parent)
@@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	struct perf_event *event;
 
 	struct perf_raw_record raw = {
-		.size = entry_size,
-		.data = record,
+		.frag = {
+			.size = entry_size,
+			.data = record,
+		},
 	};
 
 	perf_sample_data_init(&data, 0, 0);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..2417eb5512cd 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
-static inline unsigned long						\
-func_name(struct perf_output_handle *handle,				\
-	  const void *buf, unsigned long len)				\
+#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func)				\
 {									\
 	unsigned long size, written;					\
 									\
@@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle,				\
 	return len;							\
 }
 
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
+static inline unsigned long						\
+func_name(struct perf_output_handle *handle,				\
+	  const void *buf, unsigned long len)				\
+__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+		const void *buf, unsigned long len)
+__DEFINE_OUTPUT_COPY_BODY(copy_func)
+
 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 094c716154ed..35ab1b2b041b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 	struct perf_raw_record raw = {
-		.size = size,
-		.data = data,
+		.frag = {
+			.size = size,
+			.data = data,
+		},
 	};
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-- 
cgit v1.2.3


From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:04 +0200
Subject: bpf, perf: split bpf_perf_event_output

Split the bpf_perf_event_output() helper as a preparation into
two parts. The new bpf_perf_event_output() will prepare the raw
record itself and test for unknown flags from BPF trace context,
where the __bpf_perf_event_output() does the core work. The
latter will be reused later on from bpf_event_output() directly.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 35ab1b2b041b..c35883a9bc11 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+			u64 flags, struct perf_raw_record *raw)
 {
-	struct pt_regs *regs = (struct pt_regs *) (long) r1;
-	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
-	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
-	struct perf_raw_record raw = {
-		.frag = {
-			.size = size,
-			.data = data,
-		},
-	};
 
-	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
-		return -EINVAL;
 	if (index == BPF_F_CURRENT_CPU)
 		index = cpu;
 	if (unlikely(index >= array->map.max_entries))
@@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 		return -EOPNOTSUPP;
 
 	perf_sample_data_init(&sample_data, 0, 0);
-	sample_data.raw = &raw;
+	sample_data.raw = raw;
 	perf_event_output(event, &sample_data, regs);
 	return 0;
 }
 
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = (struct pt_regs *)(long) r1;
+	struct bpf_map *map  = (struct bpf_map *)(long) r2;
+	void *data = (void *)(long) r4;
+	struct perf_raw_record raw = {
+		.frag = {
+			.size = size,
+			.data = data,
+		},
+	};
+
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+
+	return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.func		= bpf_perf_event_output,
 	.gpl_only	= true,
-- 
cgit v1.2.3


From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 14 Jul 2016 18:08:05 +0200
Subject: bpf: avoid stack copy and use skb ctx for event output

This work addresses a couple of issues bpf_skb_event_output()
helper currently has: i) We need two copies instead of just a
single one for the skb data when it should be part of a sample.
The data can be non-linear and thus needs to be extracted via
bpf_skb_load_bytes() helper first, and then copied once again
into the ring buffer slot. ii) Since bpf_skb_load_bytes()
currently needs to be used first, the helper needs to see a
constant size on the passed stack buffer to make sure BPF
verifier can do sanity checks on it during verification time.
Thus, just passing skb->len (or any other non-constant value)
wouldn't work, but changing bpf_skb_load_bytes() is also not
the proper solution, since the two copies are generally still
needed. iii) bpf_skb_load_bytes() is just for rather small
buffers like headers, since they need to sit on the limited
BPF stack anyway. Instead of working around in bpf_skb_load_bytes(),
this work improves the bpf_skb_event_output() helper to address
all 3 at once.

We can make use of the passed in skb context that we have in
the helper anyway, and use some of the reserved flag bits as
a length argument. The helper will use the new __output_custom()
facility from perf side with bpf_skb_copy() as callback helper
to walk and extract the data. It will pass the data for setup
to bpf_event_output(), which generates and pushes the raw record
with an additional frag part. The linear data used in the first
frag of the record serves as programmatically defined meta data
passed along with the appended sample.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c        |  6 ++++--
 kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d638062f66d6..03fd23d4d587 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 	return NULL;
 }
 
-const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	return NULL;
+	return -ENOTSUPP;
 }
 
 /* Always built-in helper functions. */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c35883a9bc11..ebfbb7dd7033 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
 
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+	struct perf_raw_frag frag = {
+		.copy		= ctx_copy,
+		.size		= ctx_size,
+		.data		= ctx,
+	};
+	struct perf_raw_record raw = {
+		.frag = {
+			.next	= ctx_size ? &frag : NULL,
+			.size	= meta_size,
+			.data	= meta,
+		},
+	};
 
 	perf_fetch_caller_regs(regs);
 
-	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
-}
-
-static const struct bpf_func_proto bpf_event_output_proto = {
-	.func		= bpf_event_output,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
-};
-
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
-	return &bpf_event_output_proto;
+	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
 static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-- 
cgit v1.2.3


From 858d68f10238fdd1ebdd0096f912f063e97c6766 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 16 Jul 2016 01:15:55 +0200
Subject: bpf: bpf_event_entry_gen's alloc needs to be in atomic context

Should have been obvious, only called from bpf() syscall via map_update_elem()
that calls bpf_fd_array_map_update_elem() under RCU read lock and thus this
must also be in GFP_ATOMIC, of course.

Fixes: 3b1efb196eee ("bpf, maps: flush own entries on perf map release")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index db1a743e3db2..633a650d7aeb 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -430,7 +430,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
 {
 	struct bpf_event_entry *ee;
 
-	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
 	if (ee) {
 		ee->event = perf_file->private_data;
 		ee->perf_file = perf_file;
-- 
cgit v1.2.3


From 55094f57535831883b60776de5eb78c6bfb3c16d Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Tue, 19 Jul 2016 12:02:39 +0000
Subject: cgroup: remove duplicated include from cgroup.c

Remove duplicated include.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dd26e1bb7222..861995c7fc3f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,7 +61,6 @@
 #include <linux/cpuset.h>
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
-#include <linux/proc_ns.h>
 #include <net/sock.h>
 
 /*
-- 
cgit v1.2.3


From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 18 Jul 2016 15:50:58 -0700
Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union
 initialization bug

kernel/trace/bpf_trace.c: In function 'bpf_event_output':
kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer
kernel/trace/bpf_trace.c:312: warning: missing braces around initializer
kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.<anonymous>')

Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ebfbb7dd7033..a12bbd32c0a6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 	struct perf_raw_record raw = {
 		.frag = {
-			.next	= ctx_size ? &frag : NULL,
+			{
+				.next	= ctx_size ? &frag : NULL,
+			},
 			.size	= meta_size,
 			.data	= meta,
 		},
-- 
cgit v1.2.3


From 59d3656d5bf504f771fc44fdbc7a9a8590795f22 Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:46 -0700
Subject: bpf: add bpf_prog_add api for bulk prog refcnt

A subsystem may need to store many copies of a bpf program, each
deserving its own reference. Rather than requiring the caller to loop
one by one (with possible mid-loop failure), add a bulk bpf_prog_add
api.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 96d938a22050..228f962447a5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 {
-	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
-		atomic_dec(&prog->aux->refcnt);
+	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+		atomic_sub(i, &prog->aux->refcnt);
 		return ERR_PTR(-EBUSY);
 	}
 	return prog;
 }
+EXPORT_SYMBOL_GPL(bpf_prog_add);
+
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+	return bpf_prog_add(prog, 1);
+}
 
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
-- 
cgit v1.2.3


From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:47 -0700
Subject: bpf: add XDP prog type for early driver filter

Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a
new context type, struct xdp_md, is exposed to userspace. So far only
expose the packet start and end pointers, and only in read mode.

An XDP program must return one of the well known enum values, all other
return codes are reserved for future use. Unfortunately, this
restriction is hard to enforce at verification time, so take the
approach of warning at runtime when such programs are encountered. Out
of bounds return codes should alias to XDP_ABORTED.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e206c2181412..a8d67d097b0d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -713,6 +713,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_XDP:
 		break;
 	default:
 		verbose("verifier is misconfigured\n");
-- 
cgit v1.2.3


From 4acf6c0b84c91243c705303cd9ff16421914150d Mon Sep 17 00:00:00 2001
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 19 Jul 2016 12:16:56 -0700
Subject: bpf: enable direct packet data write for xdp progs

For forwarding to be effective, XDP programs should be allowed to
rewrite packet data.

This requires that the drivers supporting XDP must all map the packet
memory as TODEVICE or BIDIRECTIONAL before invoking the program.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8d67d097b0d..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 
 #define MAX_PACKET_OFF 0xffff
 
+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_XDP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 			       int size)
 {
@@ -806,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE) {
+		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into packet\n", value_regno);
+			return -EACCES;
+		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
-- 
cgit v1.2.3


From 43761473c254b45883a64441dd0bc85a42f3645c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 19 Jul 2016 17:42:57 -0400
Subject: audit: fix a double fetch in audit_log_single_execve_arg()

There is a double fetch problem in audit_log_single_execve_arg()
where we first check the execve(2) argumnets for any "bad" characters
which would require hex encoding and then re-fetch the arguments for
logging in the audit record[1].  Of course this leaves a window of
opportunity for an unsavory application to munge with the data.

This patch reworks things by only fetching the argument data once[2]
into a buffer where it is scanned and logged into the audit
records(s).  In addition to fixing the double fetch, this patch
improves on the original code in a few other ways: better handling
of large arguments which require encoding, stricter record length
checking, and some performance improvements (completely unverified,
but we got rid of some strlen() calls, that's got to be a good
thing).

As part of the development of this patch, I've also created a basic
regression test for the audit-testsuite, the test can be tracked on
GitHub at the following link:

 * https://github.com/linux-audit/audit-testsuite/issues/25

[1] If you pay careful attention, there is actually a triple fetch
problem due to a strnlen_user() call at the top of the function.

[2] This is a tiny white lie, we do make a call to strnlen_user()
prior to fetching the argument data.  I don't like it, but due to the
way the audit record is structured we really have no choice unless we
copy the entire argument at once (which would require a rather
wasteful allocation).  The good news is that with this patch the
kernel no longer relies on this strnlen_user() value for anything
beyond recording it in the log, we also update it with a trustworthy
value whenever possible.

Reported-by: Pengfei Wang <wpengfeinudt@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 332 +++++++++++++++++++++++++++----------------------------
 1 file changed, 164 insertions(+), 168 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aa3feec4df14..c65af21a12d6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/compat.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -82,7 +83,8 @@
 #define AUDITSC_SUCCESS 1
 #define AUDITSC_FAILURE 2
 
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
 #define MAX_EXECVE_AUDIT_LEN 7500
 
 /* max length to print of cmdline/proctitle value during audit */
@@ -992,184 +994,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
-/*
- * to_send and len_sent accounting are very loose estimates.  We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf?  an int is up to 12 digits long.  if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message.  In one 7500 byte message we can log up to
- * about 1000 min size arguments.  That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
-					struct audit_buffer **ab,
-					int arg_num,
-					size_t *len_sent,
-					const char __user *p,
-					char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+				  struct audit_buffer **ab)
 {
-	char arg_num_len_buf[12];
-	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 5 is the length of ' a=""' */
-	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
-	size_t len, len_left, to_send;
-	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
-	unsigned int i, has_cntl = 0, too_long = 0;
-	int ret;
-
-	/* strnlen_user includes the null we don't want to send */
-	len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
-	/*
-	 * We just created this mm, if we can't find the strings
-	 * we just copied into it something is _very_ wrong. Similar
-	 * for strings that are too long, we should not have created
-	 * any.
-	 */
-	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
-		send_sig(SIGKILL, current, 0);
-		return -1;
+	long len_max;
+	long len_rem;
+	long len_full;
+	long len_buf;
+	long len_abuf;
+	long len_tmp;
+	bool require_data;
+	bool encode;
+	unsigned int iter;
+	unsigned int arg;
+	char *buf_head;
+	char *buf;
+	const char __user *p = (const char __user *)current->mm->arg_start;
+
+	/* NOTE: this buffer needs to be large enough to hold all the non-arg
+	 *       data we put in the audit record for this argument (see the
+	 *       code below) ... at this point in time 96 is plenty */
+	char abuf[96];
+
+	/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+	 *       current value of 7500 is not as important as the fact that it
+	 *       is less than 8k, a setting of 7500 gives us plenty of wiggle
+	 *       room if we go over a little bit in the logging below */
+	WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+	len_max = MAX_EXECVE_AUDIT_LEN;
+
+	/* scratch buffer to hold the userspace args */
+	buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+	if (!buf_head) {
+		audit_panic("out of memory for argv string");
+		return;
 	}
+	buf = buf_head;
 
-	/* walk the whole argument looking for non-ascii chars */
+	audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+	len_rem = len_max;
+	len_buf = 0;
+	len_full = 0;
+	require_data = true;
+	encode = false;
+	iter = 0;
+	arg = 0;
 	do {
-		if (len_left > MAX_EXECVE_AUDIT_LEN)
-			to_send = MAX_EXECVE_AUDIT_LEN;
-		else
-			to_send = len_left;
-		ret = copy_from_user(buf, tmp_p, to_send);
-		/*
-		 * There is no reason for this copy to be short. We just
-		 * copied them here, and the mm hasn't been exposed to user-
-		 * space yet.
-		 */
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
-		}
-		buf[to_send] = '\0';
-		has_cntl = audit_string_contains_control(buf, to_send);
-		if (has_cntl) {
-			/*
-			 * hex messages get logged as 2 bytes, so we can only
-			 * send half as much in each message
-			 */
-			max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
-			break;
-		}
-		len_left -= to_send;
-		tmp_p += to_send;
-	} while (len_left > 0);
-
-	len_left = len;
-
-	if (len > max_execve_audit_len)
-		too_long = 1;
-
-	/* rewalk the argument actually logging the message */
-	for (i = 0; len_left > 0; i++) {
-		int room_left;
-
-		if (len_left > max_execve_audit_len)
-			to_send = max_execve_audit_len;
-		else
-			to_send = len_left;
-
-		/* do we have space left to send this argument in this ab? */
-		room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
-		if (has_cntl)
-			room_left -= (to_send * 2);
-		else
-			room_left -= to_send;
-		if (room_left < 0) {
-			*len_sent = 0;
-			audit_log_end(*ab);
-			*ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
-			if (!*ab)
-				return 0;
-		}
+		/* NOTE: we don't ever want to trust this value for anything
+		 *       serious, but the audit record format insists we
+		 *       provide an argument length for really long arguments,
+		 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+		 *       to use strncpy_from_user() to obtain this value for
+		 *       recording in the log, although we don't use it
+		 *       anywhere here to avoid a double-fetch problem */
+		if (len_full == 0)
+			len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+		/* read more data from userspace */
+		if (require_data) {
+			/* can we make more room in the buffer? */
+			if (buf != buf_head) {
+				memmove(buf_head, buf, len_buf);
+				buf = buf_head;
+			}
+
+			/* fetch as much as we can of the argument */
+			len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+						    len_max - len_buf);
+			if (len_tmp == -EFAULT) {
+				/* unable to copy from userspace */
+				send_sig(SIGKILL, current, 0);
+				goto out;
+			} else if (len_tmp == (len_max - len_buf)) {
+				/* buffer is not large enough */
+				require_data = true;
+				/* NOTE: if we are going to span multiple
+				 *       buffers force the encoding so we stand
+				 *       a chance at a sane len_full value and
+				 *       consistent record encoding */
+				encode = true;
+				len_full = len_full * 2;
+				p += len_tmp;
+			} else {
+				require_data = false;
+				if (!encode)
+					encode = audit_string_contains_control(
+								buf, len_tmp);
+				/* try to use a trusted value for len_full */
+				if (len_full < len_max)
+					len_full = (encode ?
+						    len_tmp * 2 : len_tmp);
+				p += len_tmp + 1;
+			}
+			len_buf += len_tmp;
+			buf_head[len_buf] = '\0';
 
-		/*
-		 * first record needs to say how long the original string was
-		 * so we can be sure nothing was lost.
-		 */
-		if ((i == 0) && (too_long))
-			audit_log_format(*ab, " a%d_len=%zu", arg_num,
-					 has_cntl ? 2*len : len);
-
-		/*
-		 * normally arguments are small enough to fit and we already
-		 * filled buf above when we checked for control characters
-		 * so don't bother with another copy_from_user
-		 */
-		if (len >= max_execve_audit_len)
-			ret = copy_from_user(buf, p, to_send);
-		else
-			ret = 0;
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
+			/* length of the buffer in the audit record? */
+			len_abuf = (encode ? len_buf * 2 : len_buf + 2);
 		}
-		buf[to_send] = '\0';
-
-		/* actually log it */
-		audit_log_format(*ab, " a%d", arg_num);
-		if (too_long)
-			audit_log_format(*ab, "[%d]", i);
-		audit_log_format(*ab, "=");
-		if (has_cntl)
-			audit_log_n_hex(*ab, buf, to_send);
-		else
-			audit_log_string(*ab, buf);
-
-		p += to_send;
-		len_left -= to_send;
-		*len_sent += arg_num_len;
-		if (has_cntl)
-			*len_sent += to_send * 2;
-		else
-			*len_sent += to_send;
-	}
-	/* include the null we didn't log */
-	return len + 1;
-}
 
-static void audit_log_execve_info(struct audit_context *context,
-				  struct audit_buffer **ab)
-{
-	int i, len;
-	size_t len_sent = 0;
-	const char __user *p;
-	char *buf;
+		/* write as much as we can to the audit log */
+		if (len_buf > 0) {
+			/* NOTE: some magic numbers here - basically if we
+			 *       can't fit a reasonable amount of data into the
+			 *       existing audit buffer, flush it and start with
+			 *       a new buffer */
+			if ((sizeof(abuf) + 8) > len_rem) {
+				len_rem = len_max;
+				audit_log_end(*ab);
+				*ab = audit_log_start(context,
+						      GFP_KERNEL, AUDIT_EXECVE);
+				if (!*ab)
+					goto out;
+			}
 
-	p = (const char __user *)current->mm->arg_start;
+			/* create the non-arg portion of the arg record */
+			len_tmp = 0;
+			if (require_data || (iter > 0) ||
+			    ((len_abuf + sizeof(abuf)) > len_rem)) {
+				if (iter == 0) {
+					len_tmp += snprintf(&abuf[len_tmp],
+							sizeof(abuf) - len_tmp,
+							" a%d_len=%lu",
+							arg, len_full);
+				}
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d[%d]=", arg, iter++);
+			} else
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d=", arg);
+			WARN_ON(len_tmp >= sizeof(abuf));
+			abuf[sizeof(abuf) - 1] = '\0';
+
+			/* log the arg in the audit record */
+			audit_log_format(*ab, "%s", abuf);
+			len_rem -= len_tmp;
+			len_tmp = len_buf;
+			if (encode) {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem / 2; /* encoding */
+				audit_log_n_hex(*ab, buf, len_tmp);
+				len_rem -= len_tmp * 2;
+				len_abuf -= len_tmp * 2;
+			} else {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem - 2; /* quotes */
+				audit_log_n_string(*ab, buf, len_tmp);
+				len_rem -= len_tmp + 2;
+				/* don't subtract the "2" because we still need
+				 * to add quotes to the remaining string */
+				len_abuf -= len_tmp;
+			}
+			len_buf -= len_tmp;
+			buf += len_tmp;
+		}
 
-	audit_log_format(*ab, "argc=%d", context->execve.argc);
+		/* ready to move to the next argument? */
+		if ((len_buf == 0) && !require_data) {
+			arg++;
+			iter = 0;
+			len_full = 0;
+			require_data = true;
+			encode = false;
+		}
+	} while (arg < context->execve.argc);
 
-	/*
-	 * we need some kernel buffer to hold the userspace args.  Just
-	 * allocate one big one rather than allocating one of the right size
-	 * for every single argument inside audit_log_single_execve_arg()
-	 * should be <8k allocation so should be pretty safe.
-	 */
-	buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
-	if (!buf) {
-		audit_panic("out of memory for argv string");
-		return;
-	}
+	/* NOTE: the caller handles the final audit_log_end() call */
 
-	for (i = 0; i < context->execve.argc; i++) {
-		len = audit_log_single_execve_arg(context, ab, i,
-						  &len_sent, p, buf);
-		if (len <= 0)
-			break;
-		p += len;
-	}
-	kfree(buf);
+out:
+	kfree(buf_head);
 }
 
 static void show_special(struct audit_context *context, int *call_panic)
-- 
cgit v1.2.3


From 5cbea46984d67f614c74c4401b54b9d681861e80 Mon Sep 17 00:00:00 2001
From: Steve Muckle <steve.muckle@linaro.org>
Date: Wed, 13 Jul 2016 13:25:26 -0700
Subject: cpufreq: schedutil: map raw required frequency to driver frequency

The slow-path frequency transition path is relatively expensive as it
requires waking up a thread to do work. Should support be added for
remote CPU cpufreq updates that is also expensive since it requires an
IPI. These activities should be avoided if they are not necessary.

To that end, calculate the actual driver-supported frequency required by
the new utilization value in schedutil by using the recently added
cpufreq_driver_resolve_freq API. If it is the same as the previously
requested driver frequency then there is no need to continue with the
update assuming the cpu frequency limits have not changed. This will
have additional benefits should the semantics of the rate limit be
changed to apply solely to frequency transitions rather than to
frequency calculations in schedutil.

The last raw required frequency is cached. This allows the driver
frequency lookup to be skipped in the event that the new raw required
frequency matches the last one, assuming a frequency update has not been
forced due to limits changing (indicated by a next_freq value of
UINT_MAX, see sugov_should_update_freq).

Signed-off-by: Steve Muckle <smuckle@linaro.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 758efd7f3abe..a84641b222c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,6 +47,8 @@ struct sugov_cpu {
 	struct update_util_data update_util;
 	struct sugov_policy *sg_policy;
 
+	unsigned int cached_raw_freq;
+
 	/* The fields below are only needed when sharing a policy. */
 	unsigned long util;
 	unsigned long max;
@@ -106,7 +108,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
 
 /**
  * get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
  * @util: Current CPU utilization.
  * @max: CPU capacity.
  *
@@ -121,14 +123,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
  * next_freq = C * curr_freq * util_raw / max
  *
  * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
  */
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
-				  unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+				  unsigned long max)
 {
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	return (freq + (freq >> 2)) * util / max;
+	freq = (freq + (freq >> 2)) * util / max;
+
+	if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+		return sg_policy->next_freq;
+	sg_cpu->cached_raw_freq = freq;
+	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
 static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -143,13 +156,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		return;
 
 	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
-			get_next_freq(policy, util, max);
+			get_next_freq(sg_cpu, util, max);
 	sugov_update_commit(sg_policy, time, next_f);
 }
 
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
 					   unsigned long util, unsigned long max)
 {
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned int max_f = policy->cpuinfo.max_freq;
 	u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -189,7 +203,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
 		}
 	}
 
-	return get_next_freq(policy, util, max);
+	return get_next_freq(sg_cpu, util, max);
 }
 
 static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -206,7 +220,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	sg_cpu->last_update = time;
 
 	if (sugov_should_update_freq(sg_policy, time)) {
-		next_f = sugov_next_freq_shared(sg_policy, util, max);
+		next_f = sugov_next_freq_shared(sg_cpu, util, max);
 		sugov_update_commit(sg_policy, time, next_f);
 	}
 
@@ -433,6 +447,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 			sg_cpu->util = ULONG_MAX;
 			sg_cpu->max = 0;
 			sg_cpu->last_update = 0;
+			sg_cpu->cached_raw_freq = 0;
 			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
 						     sugov_update_shared);
 		} else {
-- 
cgit v1.2.3


From fe12c00d21bb4985fa8da282942250be21e7dd59 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 22 Jul 2016 10:30:47 +0800
Subject: PM / hibernate: Introduce test_resume mode for hibernation

test_resume mode is to verify if the snapshot data
written to swap device can be successfully restored
to memory. It is useful to ease the debugging process
on hibernation, since this mode can not only bypass
the BIOSes/bootloader, but also the system re-initialization.

To avoid the risk to break the filesystm on persistent storage,
this patch resumes the image with tasks frozen.

For example:
echo test_resume > /sys/power/disk
echo disk > /sys/power/state

[  187.306470] PM: Image saving progress:  70%
[  187.395298] PM: Image saving progress:  80%
[  187.476697] PM: Image saving progress:  90%
[  187.554641] PM: Image saving done.
[  187.558896] PM: Wrote 594600 kbytes in 0.90 seconds (660.66 MB/s)
[  187.566000] PM: S|
[  187.589742] PM: Basic memory bitmaps freed
[  187.594694] PM: Checking hibernation image
[  187.599865] PM: Image signature found, resuming
[  187.605209] PM: Loading hibernation image.
[  187.665753] PM: Basic memory bitmaps created
[  187.691397] PM: Using 3 thread(s) for decompression.
[  187.691397] PM: Loading and decompressing image data (148650 pages)...
[  187.889719] PM: Image loading progress:   0%
[  188.100452] PM: Image loading progress:  10%
[  188.244781] PM: Image loading progress:  20%
[  189.057305] PM: Image loading done.
[  189.068793] PM: Image successfully loaded

Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 65 ++++++++++++++++++++++++++++++++----------------
 kernel/power/swap.c      |  6 +++++
 2 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5f3523e18e46..0ee1df0a0bd6 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -52,6 +52,7 @@ enum {
 #ifdef CONFIG_SUSPEND
 	HIBERNATION_SUSPEND,
 #endif
+	HIBERNATION_TEST_RESUME,
 	/* keep last */
 	__HIBERNATION_AFTER_LAST
 };
@@ -647,12 +648,39 @@ static void power_down(void)
 		cpu_relax();
 }
 
+static int load_image_and_restore(void)
+{
+	int error;
+	unsigned int flags;
+
+	pr_debug("PM: Loading hibernation image.\n");
+
+	lock_device_hotplug();
+	error = create_basic_memory_bitmaps();
+	if (error)
+		goto Unlock;
+
+	error = swsusp_read(&flags);
+	swsusp_close(FMODE_READ);
+	if (!error)
+		hibernation_restore(flags & SF_PLATFORM_MODE);
+
+	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+	swsusp_free();
+	free_basic_memory_bitmaps();
+ Unlock:
+	unlock_device_hotplug();
+
+	return error;
+}
+
 /**
  * hibernate - Carry out system hibernation, including saving the image.
  */
 int hibernate(void)
 {
 	int error, nr_calls = 0;
+	bool snapshot_test = false;
 
 	if (!hibernation_available()) {
 		pr_debug("PM: Hibernation not available.\n");
@@ -704,8 +732,12 @@ int hibernate(void)
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write(flags);
 		swsusp_free();
-		if (!error)
-			power_down();
+		if (!error) {
+			if (hibernation_mode == HIBERNATION_TEST_RESUME)
+				snapshot_test = true;
+			else
+				power_down();
+		}
 		in_suspend = 0;
 		pm_restore_gfp_mask();
 	} else {
@@ -716,6 +748,12 @@ int hibernate(void)
 	free_basic_memory_bitmaps();
  Thaw:
 	unlock_device_hotplug();
+	if (snapshot_test) {
+		pr_debug("PM: Checking hibernation image\n");
+		error = swsusp_check();
+		if (!error)
+			error = load_image_and_restore();
+	}
 	thaw_processes();
 
 	/* Don't bother checking whether freezer_test_done is true */
@@ -748,7 +786,6 @@ int hibernate(void)
 static int software_resume(void)
 {
 	int error, nr_calls = 0;
-	unsigned int flags;
 
 	/*
 	 * If the user said "noresume".. bail out early.
@@ -844,24 +881,7 @@ static int software_resume(void)
 	error = freeze_processes();
 	if (error)
 		goto Close_Finish;
-
-	pr_debug("PM: Loading hibernation image.\n");
-
-	lock_device_hotplug();
-	error = create_basic_memory_bitmaps();
-	if (error)
-		goto Thaw;
-
-	error = swsusp_read(&flags);
-	swsusp_close(FMODE_READ);
-	if (!error)
-		hibernation_restore(flags & SF_PLATFORM_MODE);
-
-	printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
-	swsusp_free();
-	free_basic_memory_bitmaps();
- Thaw:
-	unlock_device_hotplug();
+	error = load_image_and_restore();
 	thaw_processes();
  Finish:
 	__pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
@@ -887,6 +907,7 @@ static const char * const hibernation_modes[] = {
 #ifdef CONFIG_SUSPEND
 	[HIBERNATION_SUSPEND]	= "suspend",
 #endif
+	[HIBERNATION_TEST_RESUME]	= "test_resume",
 };
 
 /*
@@ -933,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
 #ifdef CONFIG_SUSPEND
 		case HIBERNATION_SUSPEND:
 #endif
+		case HIBERNATION_TEST_RESUME:
 			break;
 		case HIBERNATION_PLATFORM:
 			if (hibernation_ops)
@@ -979,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
 #ifdef CONFIG_SUSPEND
 		case HIBERNATION_SUSPEND:
 #endif
+		case HIBERNATION_TEST_RESUME:
 			hibernation_mode = mode;
 			break;
 		case HIBERNATION_PLATFORM:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 160e1006640d..51cef8432154 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -348,6 +348,12 @@ static int swsusp_swap_check(void)
 	if (res < 0)
 		blkdev_put(hib_resume_bdev, FMODE_WRITE);
 
+	/*
+	 * Update the resume device to the one actually used,
+	 * so the test_resume mode can use it in case it is
+	 * invoked from hibernate() to test the snapshot.
+	 */
+	swsusp_resume_device = hib_resume_bdev->bd_dev;
 	return res;
 }
 
-- 
cgit v1.2.3


From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Jul 2016 01:19:42 +0200
Subject: bpf, events: fix offset in skb copy handler

This patch fixes the __output_custom() routine we currently use with
bpf_skb_copy(). I missed that when len is larger than the size of the
current handle, we can issue multiple invocations of copy_func, and
__output_custom() advances destination but also source buffer by the
written amount of bytes. When we have __output_custom(), this is actually
wrong since in that case the source buffer points to a non-linear object,
in our case an skb, which the copy_func helper is supposed to walk.
Therefore, since this is non-linear we thus need to pass the offset into
the helper, so that copy_func can use it for extracting the data from
the source object.

Therefore, adjust the callback signatures properly and pass offset
into the skb_header_pointer() invoked from bpf_skb_copy() callback. The
__DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things:
i) to pass in whether we should advance source buffer or not; this is
a compile-time constant condition, ii) to pass in the offset for
__output_custom(), which we do with help of __VA_ARGS__, so everything
can stay inlined as is currently. Both changes allow for adapting the
__output_* fast-path helpers w/o extra overhead.

Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output")
Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/events/internal.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2417eb5512cd..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,18 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }
 
-#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func)				\
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)	\
 {									\
 	unsigned long size, written;					\
 									\
 	do {								\
 		size    = min(handle->size, len);			\
-		written = memcpy_func(handle->addr, buf, size);		\
+		written = memcpy_func(__VA_ARGS__);			\
 		written = size - written;				\
 									\
 		len -= written;						\
 		handle->addr += written;				\
-		buf += written;						\
+		if (advance_buf)					\
+			buf += written;					\
 		handle->size -= written;				\
 		if (!handle->size) {					\
 			struct ring_buffer *rb = handle->rb;		\
@@ -153,12 +154,16 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
 	  const void *buf, unsigned long len)				\
-__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
 
 static inline unsigned long
 __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
 		const void *buf, unsigned long len)
-__DEFINE_OUTPUT_COPY_BODY(copy_func)
+{
+	unsigned long orig_len = len;
+	__DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+				  orig_len - len, size)
+}
 
 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
-- 
cgit v1.2.3


From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Mon, 25 Jul 2016 05:54:46 -0700
Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers

This allows user memory to be written to during the course of a kprobe.
It shouldn't be used to implement any kind of security mechanism
because of TOC-TOU attacks, but rather to debug, divert, and
manipulate execution of semi-cooperative processes.

Although it uses probe_kernel_write, we limit the address space
the probe can write into by checking the space with access_ok.
We do this as opposed to calling copy_to_user directly, in order
to avoid sleeping. In addition we ensure the threads's current fs
/ segment is USER_DS and the thread isn't exiting nor a kernel thread.

Given this feature is meant for experiments, and it has a risk of
crashing the system, and running programs, we print a warning on
when a proglet that attempts to use this helper is installed,
along with the pid and process name.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a12bbd32c0a6..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *src = (void *) (long) r2;
+	int size = (int) r3;
+
+	/*
+	 * Ensure we're in user context which is safe for the helper to
+	 * run. This helper has no business in a kthread.
+	 *
+	 * access_ok() should prevent writing to non-user memory, but in
+	 * some situations (nommu, temporary switch, etc) access_ok() does
+	 * not provide enough validation, hence the check on KERNEL_DS.
+	 */
+
+	if (unlikely(in_interrupt() ||
+		     current->flags & (PF_KTHREAD | PF_EXITING)))
+		return -EPERM;
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+		return -EPERM;
+	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+		return -EPERM;
+
+	return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+	.func		= bpf_probe_write_user,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+			    current->comm, task_pid_nr(current));
+
+	return &bpf_probe_write_user_proto;
+}
+
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_probe_write_user:
+		return bpf_get_probe_write_proto();
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 4949148ad433f6f11cf837978b2907092ec99f3a Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Tue, 26 Jul 2016 15:24:24 -0700
Subject: mm: charge/uncharge kmemcg from generic page allocator paths

Currently, to charge a non-slab allocation to kmemcg one has to use
alloc_kmem_pages helper with __GFP_ACCOUNT flag.  A page allocated with
this helper should finally be freed using free_kmem_pages, otherwise it
won't be uncharged.

This API suits its current users fine, but it turns out to be impossible
to use along with page reference counting, i.e.  when an allocation is
supposed to be freed with put_page, as it is the case with pipe or unix
socket buffers.

To overcome this limitation, this patch moves charging/uncharging to
generic page allocator paths, i.e.  to __alloc_pages_nodemask and
free_pages_prepare, and zaps alloc/free_kmem_pages helpers.  This way,
one can use any of the available page allocation functions to get the
allocated page charged to kmemcg - it's enough to pass __GFP_ACCOUNT,
just like in case of kmalloc and friends.  A charged page will be
automatically uncharged on free.

To make it possible, we need to mark pages charged to kmemcg somehow.
To avoid introducing a new page flag, we make use of page->_mapcount for
marking such pages.  Since pages charged to kmemcg are not supposed to
be mapped to userspace, it should work just fine.  There are other
(ab)users of page->_mapcount - buddy and balloon pages - but we don't
conflict with them.

In case kmemcg is compiled out or not used at runtime, this patch
introduces no overhead to generic page allocator paths.  If kmemcg is
used, it will be plus one gfp flags check on alloc and plus one
page->_mapcount check on free, which shouldn't hurt performance, because
the data accessed are hot.

Link: http://lkml.kernel.org/r/a9736d856f895bcb465d9f257b54efe32eda6f99.1464079538.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..de21f25e0d2c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack)
 static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
-						  THREAD_SIZE_ORDER);
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+					     THREAD_SIZE_ORDER);
 
 	if (page)
 		memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
@@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack)
 
 	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 				    -(1 << THREAD_SIZE_ORDER));
-	__free_kmem_pages(page, THREAD_SIZE_ORDER);
+	__free_pages(page, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
-- 
cgit v1.2.3


From 1fe4d021acbc356723818a633fe0a10c59c2a4c1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 26 Jul 2016 15:26:58 -0700
Subject: cgroup: fix idr leak for the first cgroup root

The valid cgroup hierarchy ID range includes 0, so we can't filter for
positive numbers when freeing it, or it'll leak the first ID.  No big
deal, just disruptive when reading the code.

The ID is freed during error handling and when the reference count hits
zero, so the double-free test is not necessary; remove it.

Link: http://lkml.kernel.org/r/20160617162359.GB19084@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff00aca6..3108150e47b1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1160,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (root->hierarchy_id) {
-		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
-		root->hierarchy_id = 0;
-	}
+	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 }
 
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (root) {
-		/* hierarchy ID should already have been released */
-		WARN_ON_ONCE(root->hierarchy_id);
-
 		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
-- 
cgit v1.2.3


From cb773df88a737d7d7e05ca7ca516414d3fcdcab8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 26 Jul 2016 15:27:01 -0700
Subject: cgroup: remove unnecessary 0 check from css_from_id()

css_idr allocation starts at 1, so index 0 will never point to an item.
css_from_id() currently filters that before asking idr_find(), but
idr_find() would also just return NULL, so this is not needed.

Link: http://lkml.kernel.org/r/20160617162427.GC19084@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3108150e47b1..fa943843a32f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6166,7 +6166,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
+	return idr_find(&ss->css_idr, id);
 }
 
 /**
-- 
cgit v1.2.3


From bf262dcec6383188a3324192c4a7e405b3b1ad23 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jikos@kernel.org>
Date: Tue, 12 Apr 2016 05:02:09 +0930
Subject: module: fix noreturn attribute for __module_put_and_exit()

__module_put_and_exit() is makred noreturn in module.h declaration, but is
lacking the attribute in the definition, which makes some tools (such as
sparse) unhappy. Amend the definition with the attribute as well (and
reformat the declaration so that it uses more common format).

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5f71aa63ed2a..5e876977844b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -336,7 +336,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
  * A thread that wants to hold a reference to a module only while it
  * is running can call this to safely exit.  nfsd and lockd use this.
  */
-void __module_put_and_exit(struct module *mod, long code)
+void __noreturn __module_put_and_exit(struct module *mod, long code)
 {
 	module_put(mod);
 	do_exit(code);
-- 
cgit v1.2.3


From c75b590d60ffa3e31bcb9608b68006a8bab9e0ed Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 12 Apr 2016 05:03:09 +0930
Subject: module: fix redundant test.

[linux-4.5-rc4/kernel/module.c:1692]: (style) Redundant condition: attr.test.
'!attr.test || (attr.test && attr.test(mod))' is equivalent to '!attr.test ||
attr.test(mod)'

This code was added like this ten years ago, in c988d2b284549
"modules: add version and srcversion to sysfs".

Reported-by: David Binderman <dcb314@hotmail.com>
Cc: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5e876977844b..9e04a4210a4a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1693,8 +1693,7 @@ static int module_add_modinfo_attrs(struct module *mod)
 
 	temp_attr = mod->modinfo_attrs;
 	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
-		if (!attr->test ||
-		    (attr->test && attr->test(mod))) {
+		if (!attr->test || attr->test(mod)) {
 			memcpy(temp_attr, attr, sizeof(*temp_attr));
 			sysfs_attr_init(&temp_attr->attr);
 			error = sysfs_create_file(&mod->mkobj.kobj,
-- 
cgit v1.2.3


From 3205c36cf7d96024626f92d65f560035df1abcb2 Mon Sep 17 00:00:00 2001
From: Libor Pechacek <lpechacek@suse.com>
Date: Wed, 13 Apr 2016 11:06:12 +0930
Subject: module: Issue warnings when tainting kernel

While most of the locations where a kernel taint bit is set are accompanied
with a warning message, there are two which set their bits silently.  If
the tainting module gets unloaded later on, it is almost impossible to tell
what was the reason for setting the flag.

Signed-off-by: Libor Pechacek <lpechacek@suse.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 9e04a4210a4a..0b4f3a85d4fc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2919,8 +2919,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 		return -ENOEXEC;
 	}
 
-	if (!get_modinfo(info, "intree"))
+	if (!get_modinfo(info, "intree")) {
+		if (!test_taint(TAINT_OOT_MODULE))
+			pr_warn("%s: loading out-of-tree module taints kernel.\n",
+				mod->name);
 		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+	}
 
 	if (get_modinfo(info, "staging")) {
 		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3089,6 +3093,8 @@ static int move_module(struct module *mod, struct load_info *info)
 
 static int check_module_license_and_versions(struct module *mod)
 {
+	int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
 	/*
 	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
 	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -3107,6 +3113,9 @@ static int check_module_license_and_versions(struct module *mod)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
 				 LOCKDEP_NOW_UNRELIABLE);
 
+	if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+		pr_warn("%s: module license taints kernel.\n", mod->name);
+
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
-- 
cgit v1.2.3


From bca014caaa6130e57f69b5bf527967aa8ee70fdd Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Thu, 28 Apr 2016 09:24:01 +0930
Subject: module: Invalidate signatures on force-loaded modules

Signing a module should only make it trusted by the specific kernel it
was built for, not anything else.  Loading a signed module meant for a
kernel with a different ABI could have interesting effects.
Therefore, treat all signatures as invalid when a module is
force-loaded.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 0b4f3a85d4fc..7f21ab238aa7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2686,13 +2686,18 @@ static inline void kmemleak_load_module(const struct module *mod,
 #endif
 
 #ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	int err = -ENOKEY;
 	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
 	const void *mod = info->hdr;
 
-	if (info->len > markerlen &&
+	/*
+	 * Require flags == 0, as a module with version information
+	 * removed is no longer the module that was signed
+	 */
+	if (flags == 0 &&
+	    info->len > markerlen &&
 	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
 		/* We truncate the module to discard the signature */
 		info->len -= markerlen;
@@ -2711,7 +2716,7 @@ static int module_sig_check(struct load_info *info)
 	return err;
 }
 #else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	return 0;
 }
@@ -3506,7 +3511,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	long err;
 	char *after_dashes;
 
-	err = module_sig_check(info);
+	err = module_sig_check(info, flags);
 	if (err)
 		goto free_copy;
 
-- 
cgit v1.2.3


From ce4f06dcbb5d6d04d202f1b81ac72d5679dcdfc0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 26 Jul 2016 20:57:36 +0200
Subject: stop_machine: Touch_nmi_watchdog() after MULTI_STOP_PREPARE

Suppose that stop_machine(fn) hangs because fn() hangs. In this case NMI
hard-lockup can be triggered on another CPU which does nothing wrong and
the trace from nmi_panic() won't help to investigate the problem.

And this change "fixes" the problem we (seem to) hit in practice.

 - stop_two_cpus(0, 1) races with show_state_filter() running on CPU_0.

 - CPU_1 already spins in MULTI_STOP_PREPARE state, it detects the soft
   lockup and tries to report the problem.

 - show_state_filter() enables preemption, CPU_0 calls multi_cpu_stop()
   which goes to MULTI_STOP_DISABLE_IRQ state and disables interrupts.

 - CPU_1 spends more than 10 seconds trying to flush the log buffer to
   the slow serial console.

 - NMI interrupt on CPU_0 (which now waits for CPU_1) calls nmi_panic().

Reported-by: Wang Shu <shuwang@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/20160726185736.GB4088@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/stop_machine.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a467e6c28a3b..4a1ca5f6da7e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
 #include <linux/smpboot.h>
 #include <linux/atomic.h>
 #include <linux/lglock.h>
+#include <linux/nmi.h>
 
 /*
  * Structure to determine completion condition and record errors.  May
@@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data)
 				break;
 			}
 			ack_state(msdata);
+		} else if (curstate > MULTI_STOP_PREPARE) {
+			/*
+			 * At this stage all other CPUs we depend on must spin
+			 * in the same loop. Any reason for hard-lockup should
+			 * be detected and reported on their side.
+			 */
+			touch_nmi_watchdog();
 		}
 	} while (curstate != MULTI_STOP_EXIT);
 
-- 
cgit v1.2.3


From 4fae16dffb812f0e0d98a0b2b0856ca48ca63e6c Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Wed, 27 Jul 2016 11:08:18 +0200
Subject: timers/core: Correct callback order during CPU hot plug

On the tear-down path, the dead CPU callback for the timers was
misplaced within the 'cpuhp_state' enumeration.  There is a hidden
dependency between the timers and block multiqueue.  The timers
callback must happen before the block multiqueue callback otherwise a
RCU stall occurs.

Move the timers callback to the proper place in the state machine.

Reported-and-tested-by: Jon Hunter <jonathanh@nvidia.com>
Reported-by: kbuild test robot <lkp@intel.com>
Fixes: 24f73b99716a ("timers/core: Convert to hotplug state machine")
Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: John Stultz <john.stultz@linaro.org>
Cc: rt@linutronix.de
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1469610498-25914-1-git-send-email-rcochran@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f24f45915b54..341bf80f80bd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1200,11 +1200,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.startup = smpcfd_prepare_cpu,
 		.teardown = smpcfd_dead_cpu,
 	},
-	[CPUHP_TIMERS_DEAD] = {
-		.name = "timers dead",
-		.startup = NULL,
-		.teardown = timers_dead_cpu,
-	},
 	[CPUHP_RCUTREE_PREP] = {
 		.name = "RCU-tree prepare",
 		.startup = rcutree_prepare_cpu,
@@ -1221,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = {
 		.skip_onerr		= true,
 		.cant_stop		= true,
 	},
+	/*
+	 * On the tear-down path, timers_dead_cpu() must be invoked
+	 * before blk_mq_queue_reinit_notify() from notify_dead(),
+	 * otherwise a RCU stall occurs.
+	 */
+	[CPUHP_TIMERS_DEAD] = {
+		.name = "timers dead",
+		.startup = NULL,
+		.teardown = timers_dead_cpu,
+	},
 	/* Kicks the plugged cpu into life */
 	[CPUHP_BRINGUP_CPU] = {
 		.name			= "cpu:bringup",
-- 
cgit v1.2.3


From a34c80a7294e34ba213c285dff38b1137745f94b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 28 Jul 2016 15:45:16 -0700
Subject: freezer, oom: check TIF_MEMDIE on the correct task

freezing_slow_path() is checking TIF_MEMDIE to skip OOM killed tasks.
It is, however, checking the flag on the current task rather than the
given one.  This is really confusing because freezing() can be called
also on !current tasks.  It would end up working correctly for its main
purpose because __refrigerator will be always called on the current task
so the oom victim will never get frozen.  But it could lead to
surprising results when a task which is freezing a cgroup got oom killed
because only part of the cgroup would get frozen.  This is highly
unlikely but worth fixing as the resulting code would be more clear
anyway.

Link: http://lkml.kernel.org/r/1467029719-17602-2-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..6f56a9e219fa 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
-	if (test_thread_flag(TIF_MEMDIE))
+	if (test_tsk_thread_flag(p, TIF_MEMDIE))
 		return false;
 
 	if (pm_nosig_freezing || cgroup_freezing(p))
-- 
cgit v1.2.3


From fec1e5f987bfc41f9f08cbd206e7302e6ac2ab0c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 28 Jul 2016 15:45:19 -0700
Subject: cpuset, mm: fix TIF_MEMDIE check in cpuset_change_task_nodemask

Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") has added TIF_MEMDIE and PF_EXITING check but
it is checking the flag on the current task rather than the given one.

This doesn't make much sense and it is actually wrong.  If the current
task which updates the nodemask of a cpuset got killed by the OOM killer
then a part of the cpuset cgroup processes would have incompatible
nodemask which is surprising to say the least.

The comment suggests the intention was to skip oom victim or an exiting
task so we should be checking the given task.  But even then it would be
layering violation because it is the memory allocator to interpret the
TIF_MEMDIE meaning.  Simply drop both checks.  All tasks in the cpuset
should simply follow the same mask.

Link: http://lkml.kernel.org/r/1467029719-17602-3-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: Miao Xie <miaoxie@huawei.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 73e93e53884d..c7fd2778ed50 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
 	bool need_loop;
 
-	/*
-	 * Allow tasks that have access to memory reserves because they have
-	 * been OOM killed to get memory anywhere.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return;
-	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return;
-
 	task_lock(tsk);
 	/*
 	 * Determine if a loop is necessary if another thread is doing
-- 
cgit v1.2.3


From 599d0c954f91d0689c9bb421b5bc04ea02437a41 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 28 Jul 2016 15:45:31 -0700
Subject: mm, vmscan: move LRU lists to node

This moves the LRU lists from the zone to the node and related data such
as counters, tracing, congestion tracking and writeback tracking.

Unfortunately, due to reclaim and compaction retry logic, it is
necessary to account for the number of LRU pages on both zone and node
logic.  Most reclaim logic is based on the node counters but the retry
logic uses the zone counters which do not distinguish inactive and
active sizes.  It would be possible to leave the LRU counters on a
per-zone basis but it's a heavier calculation across multiple cache
lines that is much more frequent than the retry checks.

Other than the LRU counters, this is mostly a mechanical patch but note
that it introduces a number of anomalies.  For example, the scans are
per-zone but using per-node counters.  We also mark a node as congested
when a zone is congested.  This causes weird problems that are fixed
later but is easier to review.

In the event that there is excessive overhead on 32-bit systems due to
the nodes being on LRU then there are two potential solutions

1. Long-term isolation of highmem pages when reclaim is lowmem

   When pages are skipped, they are immediately added back onto the LRU
   list. If lowmem reclaim persisted for long periods of time, the same
   highmem pages get continually scanned. The idea would be that lowmem
   keeps those pages on a separate list until a reclaim for highmem pages
   arrives that splices the highmem pages back onto the LRU. It potentially
   could be implemented similar to the UNEVICTABLE list.

   That would reduce the skip rate with the potential corner case is that
   highmem pages have to be scanned and reclaimed to free lowmem slab pages.

2. Linear scan lowmem pages if the initial LRU shrink fails

   This will break LRU ordering but may be preferable and faster during
   memory pressure than skipping LRU pages.

Link: http://lkml.kernel.org/r/1467970510-21195-4-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/snapshot.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d90df926b59f..9a0178c2ac1d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
 	unsigned long size;
 
 	size = global_page_state(NR_SLAB_RECLAIMABLE)
-		+ global_page_state(NR_ACTIVE_ANON)
-		+ global_page_state(NR_INACTIVE_ANON)
-		+ global_page_state(NR_ACTIVE_FILE)
-		+ global_page_state(NR_INACTIVE_FILE)
-		- global_page_state(NR_FILE_MAPPED);
+		+ global_node_page_state(NR_ACTIVE_ANON)
+		+ global_node_page_state(NR_INACTIVE_ANON)
+		+ global_node_page_state(NR_ACTIVE_FILE)
+		+ global_node_page_state(NR_INACTIVE_FILE)
+		- global_node_page_state(NR_FILE_MAPPED);
 
 	return saveable <= size ? 0 : saveable - size;
 }
-- 
cgit v1.2.3


From a5f5f91da6ad647fb0cc7fce0e17343c0d1c5a9a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 28 Jul 2016 15:46:32 -0700
Subject: mm: convert zone_reclaim to node_reclaim

As reclaim is now per-node based, convert zone_reclaim to be
node_reclaim.  It is possible that a node will be reclaimed multiple
times if it has multiple zones but this is unavoidable without caching
all nodes traversed so far.  The documentation and interface to
userspace is the same from a configuration perspective and will will be
similar in behaviour unless the node-local allocation requests were also
limited to lower zones.

Link: http://lkml.kernel.org/r/1467970510-21195-24-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_NUMA
 	{
 		.procname	= "zone_reclaim_mode",
-		.data		= &zone_reclaim_mode,
-		.maxlen		= sizeof(zone_reclaim_mode),
+		.data		= &node_reclaim_mode,
+		.maxlen		= sizeof(node_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
-- 
cgit v1.2.3


From 11db04864336f20e19e16b64ade781eeefc3f6d3 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Jul 2016 15:48:11 -0700
Subject: mm: cleanup ifdef guards for vmem_altmap

Now that ZONE_DEVICE depends on SPARSEMEM_VMEMMAP we can simplify some
ifdef guards to just ZONE_DEVICE.

Link: http://lkml.kernel.org/r/146687646788.39261.8020536391978771940.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/memremap.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 017532193fb1..ddb3247a872a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (is_ram == REGION_INTERSECTS)
 		return __va(res->start);
 
-	if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
-		dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
-				__func__);
-		return ERR_PTR(-ENXIO);
-	}
-
 	if (!ref)
 		return ERR_PTR(-EINVAL);
 
@@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
 	altmap->alloc -= nr_pfns;
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 {
 	/*
@@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 
 	return pgmap ? pgmap->altmap : NULL;
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 #endif /* CONFIG_ZONE_DEVICE */
-- 
cgit v1.2.3


From d30dd8be06a5ae640766b20ea9ae288832bd12ac Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:14 -0700
Subject: mm: track NR_KERNEL_STACK in KiB instead of number of stacks

Currently, NR_KERNEL_STACK tracks the number of kernel stacks in a zone.
This only makes sense if each kernel stack exists entirely in one zone,
and allowing vmapped stacks could break this assumption.

Since frv has THREAD_SIZE < PAGE_SIZE, we need to track kernel stack
allocations in a unit that divides both THREAD_SIZE and PAGE_SIZE on all
architectures.  Keep it simple and use KiB.

Link: http://lkml.kernel.org/r/083c71e642c5fa5f1b6898902e1b2db7b48940d4.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index de21f25e0d2c..af3637e0ee52 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -225,7 +225,8 @@ static void account_kernel_stack(unsigned long *stack, int account)
 {
 	struct zone *zone = page_zone(virt_to_page(stack));
 
-	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
+			    THREAD_SIZE / 1024 * account);
 }
 
 void free_task(struct task_struct *tsk)
-- 
cgit v1.2.3


From efdc94907977d2db84b4b00cb9bd98ca011f6819 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:17 -0700
Subject: mm: fix memcg stack accounting for sub-page stacks

We should account for stacks regardless of stack size, and we need to
account in sub-page units if THREAD_SIZE < PAGE_SIZE.  Change the units
to kilobytes and Move it into account_kernel_stack().

Fixes: 12580e4b54ba8 ("mm: memcontrol: report kernel stack usage in cgroup2 memory.stat")
Link: http://lkml.kernel.org/r/9b5314e3ee5eda61b0317ec1563768602c1ef438.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index af3637e0ee52..52e725d4a866 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
-	if (page)
-		memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-					    1 << THREAD_SIZE_ORDER);
-
 	return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_stack(unsigned long *stack)
 {
-	struct page *page = virt_to_page(stack);
-
-	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-				    -(1 << THREAD_SIZE_ORDER));
-	__free_pages(page, THREAD_SIZE_ORDER);
+	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -223,10 +215,15 @@ static struct kmem_cache *mm_cachep;
 
 static void account_kernel_stack(unsigned long *stack, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(stack));
+	/* All stack pages are in the same zone and belong to the same memcg. */
+	struct page *first_page = virt_to_page(stack);
 
-	mod_zone_page_state(zone, NR_KERNEL_STACK_KB,
+	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 			    THREAD_SIZE / 1024 * account);
+
+	memcg_kmem_update_page_stat(
+		first_page, MEMCG_KERNEL_STACK_KB,
+		account * (THREAD_SIZE / 1024));
 }
 
 void free_task(struct task_struct *tsk)
-- 
cgit v1.2.3


From 8b70ca65616b3588ea1907e87f0df6d2530350df Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 28 Jul 2016 15:48:23 -0700
Subject: printk: when dumping regs, show the stack, not thread_info

We currently show:

  task: <current> ti: <current_thread_info()> task.ti: <task_thread_info(current)>"

"ti" and "task.ti" are redundant, and neither is actually what we want
to show, which the the base of the thread stack.  Change the display to
show the stack pointer explicitly.

Link: http://lkml.kernel.org/r/543ac5bd66ff94000a57a02e11af7239571a3055.1468523549.git.luto@kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 60cdf6386763..d4de33934dac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
 {
 	dump_stack_print_info(log_lvl);
 
-	printk("%stask: %p ti: %p task.ti: %p\n",
-	       log_lvl, current, current_thread_info(),
-	       task_thread_info(current));
+	printk("%stask: %p task.stack: %p\n",
+	       log_lvl, current, task_stack_page(current));
 }
 
 #endif
-- 
cgit v1.2.3


From e3f91083facb792dc8d8fd0a59639e4d6e7c0c8f Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Sat, 23 Jul 2016 14:42:37 +0530
Subject: jump_label: Make it possible for arches to invoke jump_label_init()
 earlier

Some arches (powerpc at least) would like to invoke jump_label_init()
much earlier in boot. So check static_key_initialized in order to make
sure this function runs only once.

LGTM-by: Ingo (http://marc.info/?l=linux-kernel&m=144049104329961&w=2)
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 kernel/jump_label.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..2d693be967df 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -235,6 +235,9 @@ void __init jump_label_init(void)
 	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
+	if (static_key_initialized)
+		return;
+
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
-- 
cgit v1.2.3


From 0d87d7ec22a0879d3926faa4f4f4412a5dee1fba Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 1 Aug 2016 13:49:29 -0700
Subject: perf/core: Change log level for duration warning to KERN_INFO

When the perf interrupt handler exceeds a threshold warning messages
are displayed on console:

  [12739.31793] perf interrupt took too long (2504 > 2500), lowering kernel.perf_event_max_sample_rate to 50000
  [71340.165065] perf interrupt took too long (5005 > 5000), lowering kernel.perf_event_max_sample_rate to 25000

Many customers and users are confused by the message wondering if
something is wrong or they need to take action to fix a problem.
Since a user can not do anything to fix the issue, the message is really
more informational than a warning. Adjust the log level accordingly.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470084569-438-1-git-send-email-dsa@cumulusnetworks.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 356a6c7cb52a..a19550d80ab1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -448,7 +448,7 @@ static u64 __report_allowed;
 
 static void perf_duration_warn(struct irq_work *w)
 {
-	printk_ratelimited(KERN_WARNING
+	printk_ratelimited(KERN_INFO
 		"perf: interrupt took too long (%lld > %lld), lowering "
 		"kernel.perf_event_max_sample_rate to %d\n",
 		__report_avg, __report_allowed,
-- 
cgit v1.2.3


From 377ccbb483738f84400ddf5840c7dd8825716985 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 28 Jul 2016 22:30:43 -0400
Subject: Makefile: Mute warning for __builtin_return_address(>0) for tracing
 only

With the latest gcc compilers, they give a warning if
__builtin_return_address() parameter is greater than 0. That is because if
it is used by a function called by a top level function (or in the case of
the kernel, by assembly), it can try to access stack frames outside the
stack and crash the system.

The tracing system uses __builtin_return_address() of up to 2! But it is
well aware of the dangers that it may have, and has even added precautions
to protect against it (see the thunk code in arch/x86/entry/thunk*.S)

Linus originally added KBUILD_CFLAGS that would suppress the warning for the
entire kernel, as simply adding KBUILD_CFLAGS to the tracing directory
wouldn't work. The tracing directory plays a bit with the CFLAGS and
requires a little more logic.

This adds that special logic to only suppress the warning for the tracing
directory. If it is used anywhere else outside of tracing, the warning will
still be triggered.

Link: http://lkml.kernel.org/r/20160728223043.51996267@grimm.local.home

Tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979e7bfbde7a..d0a1617b52b4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
 
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
 # Do not instrument the tracer itself:
 
 ifdef CONFIG_FUNCTION_TRACER
-- 
cgit v1.2.3


From 47c1856971dd05cac730f70d073518da021b2e5c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 29 Jun 2016 19:55:59 -0500
Subject: tracing: Fix use-after-free in hist_unreg_all/hist_enable_unreg_all

While running tools/testing/selftests test suite with KASAN, Dmitry
Vyukov hit the following use-after-free report:

  ==================================================================
  BUG: KASAN: use-after-free in hist_unreg_all+0x1a1/0x1d0 at addr
  ffff880031632cc0
  Read of size 8 by task ftracetest/7413
  ==================================================================
  BUG kmalloc-128 (Not tainted): kasan: bad access detected
  ------------------------------------------------------------------

This fixes the problem, along with the same problem in
hist_enable_unreg_all().

Link: http://lkml.kernel.org/r/c3d05b79e42555b6e36a3a99aae0e37315ee5304.1467247517.git.tom.zanussi@linux.intel.com

Cc: Dmitry Vyukov <dvyukov@google.com>
[Copied Steve's hist_enable_unreg_all() fix to hist_unreg_all()]
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c05b8a99806..19ae135120a3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1500,9 +1500,9 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 
 static void hist_unreg_all(struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *n;
 
-	list_for_each_entry_rcu(test, &file->triggers, list) {
+	list_for_each_entry_safe(test, n, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
 			list_del_rcu(&test->list);
 			trace_event_trigger_enable_disable(file, 0);
@@ -1699,9 +1699,9 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
 
 static void hist_enable_unreg_all(struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *n;
 
-	list_for_each_entry_rcu(test, &file->triggers, list) {
+	list_for_each_entry_safe(test, n, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
 			list_del_rcu(&test->list);
 			update_cond_flag(file);
-- 
cgit v1.2.3


From 7522c03ae307e657114ff909aec650304371a134 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Wed, 29 Jun 2016 19:56:00 -0500
Subject: tracing: Fix use-after-free in hist_register_trigger()

This fixes a use-after-free case flagged by KASAN; make sure the test
happens before the potential free in this case.

Link: http://lkml.kernel.org/r/48fd74ab61bebd7dca9714386bb47d7c5ccd6a7b.1467247517.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 19ae135120a3..f3a960ed75a1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1441,6 +1441,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		goto out;
 	}
 
+	if (hist_data->attrs->pause)
+		data->paused = true;
+
 	if (named_data) {
 		destroy_hist_data(data->private_data);
 		data->private_data = named_data->private_data;
@@ -1448,9 +1451,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		data->ops = &event_hist_trigger_named_ops;
 	}
 
-	if (hist_data->attrs->pause)
-		data->paused = true;
-
 	if (data->ops->init) {
 		ret = data->ops->init(data->ops, data);
 		if (ret < 0)
-- 
cgit v1.2.3


From 61e96496d3c949701a48b908f99f4ed891cd1101 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 2 Aug 2016 14:03:44 -0700
Subject: task_work: use READ_ONCE/lockless_dereference, avoid pi_lock if
 !task_works

Change task_work_cancel() to use lockless_dereference(), this is what
the code really wants but we didn't have this helper when it was
written.

Also add the fast-path task->task_works == NULL check, in the likely
case this task has no pending works and we can avoid
spin_lock(task->pi_lock).

While at it, change other users of ACCESS_ONCE() to use READ_ONCE().

Link: http://lkml.kernel.org/r/20160610150042.GA13868@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/task_work.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 6ab4842b00e8..d513051fcca2 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
 	struct callback_head *head;
 
 	do {
-		head = ACCESS_ONCE(task->task_works);
+		head = READ_ONCE(task->task_works);
 		if (unlikely(head == &work_exited))
 			return -ESRCH;
 		work->next = head;
@@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	struct callback_head **pprev = &task->task_works;
 	struct callback_head *work;
 	unsigned long flags;
+
+	if (likely(!task->task_works))
+		return NULL;
 	/*
 	 * If cmpxchg() fails we continue without updating pprev.
 	 * Either we raced with task_work_add() which added the
@@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	 * we raced with task_work_run(), *pprev == NULL/exited.
 	 */
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
-	while ((work = ACCESS_ONCE(*pprev))) {
-		smp_read_barrier_depends();
+	while ((work = lockless_dereference(*pprev))) {
 		if (work->func != func)
 			pprev = &work->next;
 		else if (cmpxchg(pprev, work, work->next) == work)
@@ -95,7 +97,7 @@ void task_work_run(void)
 		 * work_exited unless the list is empty.
 		 */
 		do {
-			work = ACCESS_ONCE(task->task_works);
+			work = READ_ONCE(task->task_works);
 			head = !work && (task->flags & PF_EXITING) ?
 				&work_exited : NULL;
 		} while (cmpxchg(&task->task_works, work, head) != work);
-- 
cgit v1.2.3


From 9d5059c959ac739dbf837cec14586e58e7a67292 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@osg.samsung.com>
Date: Tue, 2 Aug 2016 14:03:47 -0700
Subject: dynamic_debug: only add header when used

kernel.h header doesn't directly use dynamic debug, instead we can
include it in module.c (which used it via kernel.h).  printk.h only uses
it if CONFIG_DYNAMIC_DEBUG is on, changing the inclusion to only happen
in that case.

Link: http://lkml.kernel.org/r/1468429793-16917-1-git-send-email-luisbg@osg.samsung.com
[luisbg@osg.samsung.com: include dynamic_debug.h in drb_int.h]
  Link: http://lkml.kernel.org/r/1468447828-18558-2-git-send-email-luisbg@osg.samsung.com
Signed-off-by: Luis de Bethencourt <luisbg@osg.samsung.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5f71aa63ed2a..a0f48b8b00da 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,6 +60,7 @@
 #include <linux/jump_label.h>
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
 #include <uapi/linux/module.h>
 #include "module-internal.h"
 
-- 
cgit v1.2.3


From bebca05281d039e4144e1c51f402fd1d5f54b5e2 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 2 Aug 2016 14:03:50 -0700
Subject: printk: do not include interrupt.h

A trivial cosmetic change: interrupt.h header is redundant since commit
6b898c07cb1d ("console: use might_sleep in console_lock").

Link: http://lkml.kernel.org/r/20160620132847.21930-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d4de33934dac..09af62e71fee 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -26,7 +26,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/interrupt.h>			/* For in_interrupt() */
 #include <linux/delay.h>
 #include <linux/smp.h>
 #include <linux/security.h>
-- 
cgit v1.2.3


From 874f9c7da9a4acbc1b9e12ca722579fb50e4d142 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 2 Aug 2016 14:03:53 -0700
Subject: printk: create pr_<level> functions

Using functions instead of macros can reduce overall code size by
eliminating unnecessary "KERN_SOH<digit>" prefixes from format strings.

  defconfig x86-64:

  $ size vmlinux*
     text    data     bss      dec     hex  filename
  10193570 4331464 1105920 15630954  ee826a vmlinux.new
  10192623 4335560 1105920 15634103  ee8eb7 vmlinux.old

As the return value are unimportant and unused in the kernel tree, these
new functions return void.

Miscellanea:

 - change pr_<level> macros to call new __pr_<level> functions
 - change vprintk_nmi and vprintk_default to add LOGLEVEL_<level> argument

[akpm@linux-foundation.org: fix LOGLEVEL_INFO, per Joe]
Link: http://lkml.kernel.org/r/e16cc34479dfefcae37c98b481e6646f0f69efc3.1466718827.git.joe@perches.com
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/internal.h | 16 ++++++++++------
 kernel/printk/nmi.c      | 13 +++++++++++--
 kernel/printk/printk.c   | 27 ++++++++++++++++++++++++---
 3 files changed, 45 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..5d4505f30083 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,9 +16,11 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
+					    va_list args);
 
-int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+__printf(2, 0)
+int vprintk_default(int level, const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -31,9 +33,10 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+__printf(2, 0)
+static inline int vprintk_func(int level, const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(fmt, args);
+	return this_cpu_read(printk_func)(level, fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -44,9 +47,10 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+__printf(2, 0)
+static inline int vprintk_func(int level, const char *fmt, va_list args)
 {
-	return vprintk_default(fmt, args);
+	return vprintk_default(level, fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index b69eb8a2876f..bc3eeb1ae6da 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(const char *fmt, va_list args)
+static int vprintk_nmi(int level, const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,7 +79,16 @@ again:
 	if (!len)
 		smp_rmb();
 
-	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+	if (level != LOGLEVEL_DEFAULT) {
+		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
+				KERN_SOH "%c", '0' + level);
+		add += vsnprintf(s->buffer + len + add,
+				 sizeof(s->buffer) - len - add,
+				 fmt, args);
+	} else {
+		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
+				fmt, args);
+	}
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 09af62e71fee..d2accf2f4448 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1801,7 +1801,28 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-int vprintk_default(const char *fmt, va_list args)
+#ifdef CONFIG_PRINTK
+#define define_pr_level(func, loglevel)				\
+asmlinkage __visible void func(const char *fmt, ...)		\
+{								\
+	va_list args;						\
+								\
+	va_start(args, fmt);					\
+	vprintk_default(loglevel, fmt, args);			\
+	va_end(args);						\
+}								\
+EXPORT_SYMBOL(func)
+
+define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
+define_pr_level(__pr_alert, LOGLEVEL_ALERT);
+define_pr_level(__pr_crit, LOGLEVEL_CRIT);
+define_pr_level(__pr_err, LOGLEVEL_ERR);
+define_pr_level(__pr_warn, LOGLEVEL_WARNING);
+define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
+define_pr_level(__pr_info, LOGLEVEL_INFO);
+#endif
+
+int vprintk_default(int level, const char *fmt, va_list args)
 {
 	int r;
 
@@ -1811,7 +1832,7 @@ int vprintk_default(const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+	r = vprintk_emit(0, level, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1844,7 +1865,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(fmt, args);
+	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.2.3


From cf7754441c563230ed74096fcd4b8cca49910550 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 2 Aug 2016 14:03:56 -0700
Subject: printk: introduce suppress_message_printing()

Messages' levels and console log level are inspected when the actual
printing occurs, which may provoke console_unlock() and
console_cont_flush() to waste CPU cycles on every message that has
loglevel above the current console_loglevel.

Schematically, console_unlock() does the following:

console_unlock()
{
        ...
        for (;;) {
                ...
                raw_spin_lock_irqsave(&logbuf_lock, flags);
skip:
                msg = log_from_idx(console_idx);

                if (msg->flags & LOG_NOCONS) {
                        ...
                        goto skip;
                }

                level = msg->level;
                len += msg_print_text();                        >> sprintfs
                                                                   memcpy,
                                                                   etc.

                if (nr_ext_console_drivers) {
                        ext_len = msg_print_ext_header();       >> scnprintf
                        ext_len += msg_print_ext_body();        >> scnprintfs
                                                                   etc.
                }
                ...
                raw_spin_unlock(&logbuf_lock);

                call_console_drivers(level, ext_text, ext_len, text, len)
                {
                        if (level >= console_loglevel &&        >> drop the message
                                        !ignore_loglevel)
                                return;

                        console->write(...);
                }

                local_irq_restore(flags);
        }
        ...
}

The thing here is this deferred `level >= console_loglevel' check.  We
are wasting CPU cycles on sprintfs/memcpy/etc.  preparing the messages
that we will eventually drop.

This can be huge when we register a new CON_PRINTBUFFER console, for
instance.  For every such a console register_console() resets the

        console_seq, console_idx, console_prev

and sets a `exclusive console' pointer to replay the log buffer to that
just-registered console.  And there can be a lot of messages to replay,
in the worst case most of which can be dropped after console_loglevel
test.

We know messages' levels long before we call msg_print_text() and
friends, so we can just move console_loglevel check out of
call_console_drivers() and format a new message only if we are sure that
it won't be dropped.

The patch factors out loglevel check into suppress_message_printing()
function and tests message->level and console_loglevel before formatting
functions in console_unlock() and console_cont_flush() are getting
executed.  This improves things not only for exclusive CON_PRINTBUFFER
consoles, but for every console_unlock() that attempts to print a
message of level above the console_loglevel.

Link: http://lkml.kernel.org/r/20160627135012.8229-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Calvin Owens <calvinowens@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d2accf2f4448..8bdce14254f4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -985,6 +985,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(ignore_loglevel,
 		 "ignore loglevel setting (prints all kernel messages to the console)");
 
+static bool suppress_message_printing(int level)
+{
+	return (level >= console_loglevel && !ignore_loglevel);
+}
+
 #ifdef CONFIG_BOOT_PRINTK_DELAY
 
 static int boot_delay; /* msecs delay after each printk during bootup */
@@ -1014,7 +1019,7 @@ static void boot_delay_msec(int level)
 	unsigned long timeout;
 
 	if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
-		|| (level >= console_loglevel && !ignore_loglevel)) {
+		|| suppress_message_printing(level)) {
 		return;
 	}
 
@@ -1438,8 +1443,6 @@ static void call_console_drivers(int level,
 
 	trace_console(text, len);
 
-	if (level >= console_loglevel && !ignore_loglevel)
-		return;
 	if (!console_drivers)
 		return;
 
@@ -1908,6 +1911,7 @@ static void call_console_drivers(int level,
 static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static size_t cont_print_text(char *text, size_t size) { return 0; }
+static bool suppress_message_printing(int level) { return false; }
 
 /* Still needs to be defined for users */
 DEFINE_PER_CPU(printk_func_t, printk_func);
@@ -2187,6 +2191,13 @@ static void console_cont_flush(char *text, size_t size)
 	if (!cont.len)
 		goto out;
 
+	if (suppress_message_printing(cont.level)) {
+		cont.cons = cont.len;
+		if (cont.flushed)
+			cont.len = 0;
+		goto out;
+	}
+
 	/*
 	 * We still queue earlier records, likely because the console was
 	 * busy. The earlier ones need to be printed before this one, we
@@ -2290,10 +2301,13 @@ skip:
 			break;
 
 		msg = log_from_idx(console_idx);
-		if (msg->flags & LOG_NOCONS) {
+		level = msg->level;
+		if ((msg->flags & LOG_NOCONS) ||
+				suppress_message_printing(level)) {
 			/*
 			 * Skip record we have buffered and already printed
-			 * directly to the console when we received it.
+			 * directly to the console when we received it, and
+			 * record that has level above the console loglevel.
 			 */
 			console_idx = log_next(console_idx);
 			console_seq++;
@@ -2307,7 +2321,6 @@ skip:
 			goto skip;
 		}
 
-		level = msg->level;
 		len += msg_print_text(msg, console_prev, false,
 				      text + len, sizeof(text) - len);
 		if (nr_ext_console_drivers) {
-- 
cgit v1.2.3


From 40a7d9f5f90681c6d7890b6a07f230bb4afe7e39 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Aug 2016 14:03:59 -0700
Subject: printk: include <asm/sections.h> instead of <asm-generic/sections.h>

asm-generic headers are generic implementations for architecture
specific code and should not be included by common code.  Thus use the
asm/ version of sections.h to get at the linker sections.

Link: http://lkml.kernel.org/r/1468285008-7331-1-git-send-email-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8bdce14254f4..70c66c5ba212 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -47,7 +47,7 @@
 #include <linux/uio.h>
 
 #include <asm/uaccess.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/printk.h>
-- 
cgit v1.2.3


From 750afe7babd117daabebf4855da18e4418ea845e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 2 Aug 2016 14:04:07 -0700
Subject: printk: add kernel parameter to control writes to /dev/kmsg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a "printk.devkmsg" kernel command line parameter which controls how
userspace writes into /dev/kmsg.  It has three options:

 * ratelimit - ratelimit logging from userspace.
 * on  - unlimited logging from userspace
 * off - logging from userspace gets ignored

The default setting is to ratelimit the messages written to it.

This changes the kernel default setting of "on" to "ratelimit" and we do
that because we want to keep userspace spamming /dev/kmsg to sane
levels.  This is especially moot when a small kernel log buffer wraps
around and messages get lost.  So the ratelimiting setting should be a
sane setting where kernel messages should have a bit higher chance of
survival from all the spamming.

It additionally does not limit logging to /dev/kmsg while the system is
booting if we haven't disabled it on the command line.

Furthermore, we can control the logging from a lower priority sysctl
interface - kernel.printk_devkmsg.

That interface will succeed only if printk.devkmsg *hasn't* been
supplied on the command line.  If it has, then printk.devkmsg is a
one-time setting which remains for the duration of the system lifetime.
This "locking" of the setting is to prevent userspace from changing the
logging on us through sysctl(2).

This patch is based on previous patches from Linus and Steven.

[bp@suse.de: fixes]
  Link: http://lkml.kernel.org/r/20160719072344.GC25563@nazgul.tnic
Link: http://lkml.kernel.org/r/20160716061745.15795-3-bp@alien8.de
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Dave Young <dyoung@redhat.com>
Cc: Franck Bui <fbui@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sysctl.c        |   7 +++
 2 files changed, 141 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 70c66c5ba212..a5ef95ca18c9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -85,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = {
 };
 #endif
 
+enum devkmsg_log_bits {
+	__DEVKMSG_LOG_BIT_ON = 0,
+	__DEVKMSG_LOG_BIT_OFF,
+	__DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+	DEVKMSG_LOG_MASK_ON             = BIT(__DEVKMSG_LOG_BIT_ON),
+	DEVKMSG_LOG_MASK_OFF            = BIT(__DEVKMSG_LOG_BIT_OFF),
+	DEVKMSG_LOG_MASK_LOCK           = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT	0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strncmp(str, "on", 2)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_ON;
+		return 2;
+	} else if (!strncmp(str, "off", 3)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+		return 3;
+	} else if (!strncmp(str, "ratelimit", 9)) {
+		devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+		return 9;
+	}
+	return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+	if (__control_devkmsg(str) < 0)
+		return 1;
+
+	/*
+	 * Set sysctl string accordingly:
+	 */
+	if (devkmsg_log == DEVKMSG_LOG_MASK_ON) {
+		memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+		strncpy(devkmsg_log_str, "on", 2);
+	} else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) {
+		memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
+		strncpy(devkmsg_log_str, "off", 3);
+	}
+	/* else "ratelimit" which is set by default. */
+
+	/*
+	 * Sysctl cannot change it anymore. The kernel command line setting of
+	 * this parameter is to force the setting to be permanent throughout the
+	 * runtime of the system. This is a precation measure against userspace
+	 * trying to be a smarta** and attempting to change it up on us.
+	 */
+	devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+	return 0;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+
+int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+			      void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char old_str[DEVKMSG_STR_MAX_SIZE];
+	unsigned int old;
+	int err;
+
+	if (write) {
+		if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+			return -EINVAL;
+
+		old = devkmsg_log;
+		strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+	}
+
+	err = proc_dostring(table, write, buffer, lenp, ppos);
+	if (err)
+		return err;
+
+	if (write) {
+		err = __control_devkmsg(devkmsg_log_str);
+
+		/*
+		 * Do not accept an unknown string OR a known string with
+		 * trailing crap...
+		 */
+		if (err < 0 || (err + 1 != *lenp)) {
+
+			/* ... and restore old setting. */
+			devkmsg_log = old;
+			strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Number of registered extended console drivers.
  *
@@ -613,6 +718,7 @@ struct devkmsg_user {
 	u64 seq;
 	u32 idx;
 	enum log_flags prev;
+	struct ratelimit_state rs;
 	struct mutex lock;
 	char buf[CONSOLE_EXT_LOG_MAX];
 };
@@ -622,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	char *buf, *line;
 	int level = default_message_loglevel;
 	int facility = 1;	/* LOG_USER */
+	struct file *file = iocb->ki_filp;
+	struct devkmsg_user *user = file->private_data;
 	size_t len = iov_iter_count(from);
 	ssize_t ret = len;
 
-	if (len > LOG_LINE_MAX)
+	if (!user || len > LOG_LINE_MAX)
 		return -EINVAL;
+
+	/* Ignore when user logging is disabled. */
+	if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+		return len;
+
+	/* Ratelimit when not explicitly enabled. */
+	if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+		if (!___ratelimit(&user->rs, current->comm))
+			return ret;
+	}
+
 	buf = kmalloc(len+1, GFP_KERNEL);
 	if (buf == NULL)
 		return -ENOMEM;
@@ -799,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	struct devkmsg_user *user;
 	int err;
 
-	/* write-only does not need any file context */
-	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		return 0;
+	if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+		return -EPERM;
 
-	err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
-				       SYSLOG_FROM_READER);
-	if (err)
-		return err;
+	/* write-only does not need any file context */
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+					       SYSLOG_FROM_READER);
+		if (err)
+			return err;
+	}
 
 	user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
 	if (!user)
 		return -ENOMEM;
 
+	ratelimit_default_init(&user->rs);
+	ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
 	mutex_init(&user->lock);
 
 	raw_spin_lock_irq(&logbuf_lock);
@@ -830,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file)
 	if (!user)
 		return 0;
 
+	ratelimit_state_exit(&user->rs);
+
 	mutex_destroy(&user->lock);
 	kfree(user);
 	return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 53954631a4e1..b43d0b27c1fe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -813,6 +813,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &ten_thousand,
 	},
+	{
+		.procname	= "printk_devkmsg",
+		.data		= devkmsg_log_str,
+		.maxlen		= DEVKMSG_STR_MAX_SIZE,
+		.mode		= 0644,
+		.proc_handler	= devkmsg_sysctl_set_loglvl,
+	},
 	{
 		.procname	= "dmesg_restrict",
 		.data		= &dmesg_restrict,
-- 
cgit v1.2.3


From 627393d44860386e948bb63a8e5b53f2cc44d070 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 2 Aug 2016 14:05:40 -0700
Subject: kernel/exit.c: quieten greatest stack depth printk

Many targets enable CONFIG_DEBUG_STACK_USAGE, and while the information
is useful, it isn't worthy of pr_warn().  Reduce it to pr_info().

Link: http://lkml.kernel.org/r/1466982072-29836-1-git-send-email-anton@ozlabs.org
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 84ae830234f8..2f974ae042a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -715,7 +715,7 @@ static void check_stack_usage(void)
 
 	spin_lock(&low_water_lock);
 	if (free < lowest_to_date) {
-		pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
 			current->comm, task_pid_nr(current), free);
 		lowest_to_date = free;
 	}
-- 
cgit v1.2.3


From 4caf9615247aceab56e91df6c0e11892ea55f4f0 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Tue, 2 Aug 2016 14:05:45 -0700
Subject: kexec: return error number directly

This is a cleanup patch to make kexec more clear to return error number
directly.  The variable result is useless, because there is no other
function's return value assignes to it.  So remove it.

Link: http://lkml.kernel.org/r/1464179273-57668-1-git-send-email-mnghuan@gmail.com
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 56b3ed0927b0..23311c803b1b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -147,7 +147,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 
 int sanity_check_segment_list(struct kimage *image)
 {
-	int result, i;
+	int i;
 	unsigned long nr_segments = image->nr_segments;
 
 	/*
@@ -163,16 +163,15 @@ int sanity_check_segment_list(struct kimage *image)
 	 * simply because addresses are changed to page size
 	 * granularity.
 	 */
-	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-			return result;
+			return -EADDRNOTAVAIL;
 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-			return result;
+			return -EADDRNOTAVAIL;
 	}
 
 	/* Verify our destination addresses do not overlap.
@@ -180,7 +179,6 @@ int sanity_check_segment_list(struct kimage *image)
 	 * through very weird things can happen with no
 	 * easy explanation as one segment stops on another.
 	 */
-	result = -EINVAL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 		unsigned long j;
@@ -194,7 +192,7 @@ int sanity_check_segment_list(struct kimage *image)
 			pend   = pstart + image->segment[j].memsz;
 			/* Do the segments overlap ? */
 			if ((mend > pstart) && (mstart < pend))
-				return result;
+				return -EINVAL;
 		}
 	}
 
@@ -203,10 +201,9 @@ int sanity_check_segment_list(struct kimage *image)
 	 * and it is easier to check up front than to be surprised
 	 * later on.
 	 */
-	result = -EINVAL;
 	for (i = 0; i < nr_segments; i++) {
 		if (image->segment[i].bufsz > image->segment[i].memsz)
-			return result;
+			return -EINVAL;
 	}
 
 	/*
@@ -220,7 +217,6 @@ int sanity_check_segment_list(struct kimage *image)
 	 */
 
 	if (image->type == KEXEC_TYPE_CRASH) {
-		result = -EADDRNOTAVAIL;
 		for (i = 0; i < nr_segments; i++) {
 			unsigned long mstart, mend;
 
@@ -229,7 +225,7 @@ int sanity_check_segment_list(struct kimage *image)
 			/* Ensure we are within the crash kernel limits */
 			if ((mstart < crashk_res.start) ||
 			    (mend > crashk_res.end))
-				return result;
+				return -EADDRNOTAVAIL;
 		}
 	}
 
-- 
cgit v1.2.3


From 465d377701dfe6a08a9f361a3fd926dea7f89c74 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:05:57 -0700
Subject: kexec: ensure user memory sizes do not wrap

Ensure that user memory sizes do not wrap around when validating the
user input, which can lead to the following input validation working
incorrectly.

[akpm@linux-foundation.org: fix it for kexec-return-error-number-directly.patch]
Link: http://lkml.kernel.org/r/E1b8koF-0004HM-5x@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Reviewed-by: Pratyush Anand <panand@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23311c803b1b..5a83b2a9d584 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -168,6 +168,8 @@ int sanity_check_segment_list(struct kimage *image)
 
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
+		if (mstart > mend)
+			return -EADDRNOTAVAIL;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 			return -EADDRNOTAVAIL;
 		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-- 
cgit v1.2.3


From dae28018f56645b61f5beb84d5831346d3c5e457 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:06:00 -0700
Subject: kdump: arrange for paddr_vmcoreinfo_note() to return phys_addr_t

On PAE systems (eg, ARM LPAE) the vmcore note may be located above 4GB
physical on 32-bit architectures, so we need a wider type than "unsigned
long" here.  Arrange for paddr_vmcoreinfo_note() to return a
phys_addr_t, thereby allowing it to be located above 4GB.

This makes no difference for kexec-tools, as they already assume a
64-bit type when reading from this file.

Link: http://lkml.kernel.org/r/E1b8koK-0004HS-K9@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Reviewed-by: Pratyush Anand <panand@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 2 +-
 kernel/ksysfs.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5a83b2a9d584..dab03f17be25 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1372,7 +1372,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 void __weak arch_crash_save_vmcoreinfo(void)
 {}
 
-unsigned long __weak paddr_vmcoreinfo_note(void)
+phys_addr_t __weak paddr_vmcoreinfo_note(void)
 {
 	return __pa((unsigned long)(char *)&vmcoreinfo_note);
 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 152da4a48867..9f1920d2d0c6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size);
 static ssize_t vmcoreinfo_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%lx %x\n",
-		       paddr_vmcoreinfo_note(),
+	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+	return sprintf(buf, "%pa %x\n", &vmcore_base,
 		       (unsigned int)sizeof(vmcoreinfo_note));
 }
 KERNEL_ATTR_RO(vmcoreinfo);
-- 
cgit v1.2.3


From 43546d8669d62d75fa69ca9a45d2f586665f56bd Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 2 Aug 2016 14:06:04 -0700
Subject: kexec: allow architectures to override boot mapping

kexec physical addresses are the boot-time view of the system.  For
certain ARM systems (such as Keystone 2), the boot view of the system
does not match the kernel's view of the system: the boot view uses a
special alias in the lower 4GB of the physical address space.

To cater for these kinds of setups, we need to translate between the
boot view physical addresses and the normal kernel view physical
addresses.  This patch extracts the current transation points into
linux/kexec.h, and allows an architecture to override the functions.

Due to the translations required, we unfortunately end up with six
translation functions, which are reduced down to four that the
architecture can override.

[akpm@linux-foundation.org: kexec.h needs asm/io.h for phys_to_virt()]
Link: http://lkml.kernel.org/r/E1b8koP-0004HZ-Vf@rmk-PC.armlinux.org.uk
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Keerthy <j-keerthy@ti.com>
Cc: Pratyush Anand <panand@redhat.com>
Cc: Vitaly Andrianov <vitalya@ti.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c      |  3 ++-
 kernel/kexec_core.c | 26 +++++++++++++-------------
 2 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4384672d3245..980936a90ee6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 
 	if (kexec_on_panic) {
 		/* Verify we have a valid entry point */
-		if ((entry < crashk_res.start) || (entry > crashk_res.end))
+		if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+		    (entry > phys_to_boot_phys(crashk_res.end)))
 			return -EADDRNOTAVAIL;
 	}
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index dab03f17be25..73d4c5f57dd8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -225,8 +225,8 @@ int sanity_check_segment_list(struct kimage *image)
 			mstart = image->segment[i].mem;
 			mend = mstart + image->segment[i].memsz - 1;
 			/* Ensure we are within the crash kernel limits */
-			if ((mstart < crashk_res.start) ||
-			    (mend > crashk_res.end))
+			if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+			    (mend > phys_to_boot_phys(crashk_res.end)))
 				return -EADDRNOTAVAIL;
 		}
 	}
@@ -350,7 +350,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
 		if (!pages)
 			break;
-		pfn   = page_to_pfn(pages);
+		pfn   = page_to_boot_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
@@ -476,7 +476,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 			return -ENOMEM;
 
 		ind_page = page_address(page);
-		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		*image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
 		image->last_entry = ind_page +
 				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -531,13 +531,13 @@ void kimage_terminate(struct kimage *image)
 #define for_each_kimage_entry(image, ptr, entry) \
 	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 		ptr = (entry & IND_INDIRECTION) ? \
-			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+			boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
 
 static void kimage_free_entry(kimage_entry_t entry)
 {
 	struct page *page;
 
-	page = pfn_to_page(entry >> PAGE_SHIFT);
+	page = boot_pfn_to_page(entry >> PAGE_SHIFT);
 	kimage_free_pages(page);
 }
 
@@ -631,7 +631,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	 * have a match.
 	 */
 	list_for_each_entry(page, &image->dest_pages, lru) {
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
 		if (addr == destination) {
 			list_del(&page->lru);
 			return page;
@@ -646,12 +646,12 @@ static struct page *kimage_alloc_page(struct kimage *image,
 		if (!page)
 			return NULL;
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) >
+		if (page_to_boot_pfn(page) >
 				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unusable_pages);
 			continue;
 		}
-		addr = page_to_pfn(page) << PAGE_SHIFT;
+		addr = page_to_boot_pfn(page) << PAGE_SHIFT;
 
 		/* If it is the destination page we want use it */
 		if (addr == destination)
@@ -674,7 +674,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			struct page *old_page;
 
 			old_addr = *old & PAGE_MASK;
-			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
 			copy_highpage(page, old_page);
 			*old = addr | (*old & ~PAGE_MASK);
 
@@ -730,7 +730,7 @@ static int kimage_load_normal_segment(struct kimage *image,
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page)
+		result = kimage_add_page(image, page_to_boot_pfn(page)
 								<< PAGE_SHIFT);
 		if (result < 0)
 			goto out;
@@ -791,7 +791,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 		char *ptr;
 		size_t uchunk, mchunk;
 
-		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
 		if (!page) {
 			result  = -ENOMEM;
 			goto out;
@@ -919,7 +919,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
 	unsigned long addr;
 
 	for (addr = begin; addr < end; addr += PAGE_SIZE)
-		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+		free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT));
 }
 
 int crash_shrink_memory(unsigned long new_size)
-- 
cgit v1.2.3


From b26e27ddfd2a986dc53e259aba572f3aac182eb8 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Tue, 2 Aug 2016 14:06:13 -0700
Subject: kexec: use core_param for crash_kexec_post_notifiers boot option

crash_kexec_post_notifiers ia a boot option which controls whether the
1st kernel calls panic notifiers or not before booting the 2nd kernel.
However, there is no need to limit it to being modifiable only at boot
time.  So, use core_param instead of early_param.

Link: http://lkml.kernel.org/r/20160705113327.5864.43139.stgit@softrs
Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 8aa74497cc5a..ca8cea1ef673 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -108,6 +108,7 @@ void panic(const char *fmt, ...)
 	long i, i_next = 0;
 	int state = 0;
 	int old_cpu, this_cpu;
+	bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
 
 	/*
 	 * Disable local interrupts. This will prevent panic_smp_self_stop
@@ -160,7 +161,7 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (!crash_kexec_post_notifiers) {
+	if (!_crash_kexec_post_notifiers) {
 		printk_nmi_flush_on_panic();
 		__crash_kexec(NULL);
 	}
@@ -191,7 +192,7 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (crash_kexec_post_notifiers)
+	if (_crash_kexec_post_notifiers)
 		__crash_kexec(NULL);
 
 	bust_spinlocks(0);
@@ -571,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
-
-static int __init setup_crash_kexec_post_notifiers(char *s)
-{
-	crash_kexec_post_notifiers = true;
-	return 0;
-}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
 
 static int __init oops_setup(char *s)
 {
-- 
cgit v1.2.3


From 21db79e8bb054d0351a6b1b464f1c9c47a2e6e8d Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.com>
Date: Tue, 2 Aug 2016 14:06:16 -0700
Subject: kexec: add a kexec_crash_loaded() function

Provide a wrapper function to be used by kernel code to check whether a
crash kernel is loaded.  It returns the same value that can be seen in
/sys/kernel/kexec_crash_loaded by userspace programs.

I'm exporting the function, because it will be used by Xen, and it is
possible to compile Xen modules separately to enable the use of PV
drivers with unmodified bare-metal kernels.

Link: http://lkml.kernel.org/r/20160713121955.14969.69080.stgit@hananiah.suse.cz
Signed-off-by: Petr Tesarik <ptesarik@suse.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 6 ++++++
 kernel/ksysfs.c     | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 73d4c5f57dd8..704534029a00 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p)
 	return 0;
 }
 
+int kexec_crash_loaded(void)
+{
+	return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
 /*
  * When kexec transitions to the new kernel there is a one-to-one
  * mapping between physical and virtual addresses.  On processors
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9f1920d2d0c6..ee1bc1bb8feb 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded);
 static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
 				       struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", !!kexec_crash_image);
+	return sprintf(buf, "%d\n", kexec_crash_loaded());
 }
 KERNEL_ATTR_RO(kexec_crash_loaded);
 
-- 
cgit v1.2.3


From 1730f146604ea426e54938cdbcf87df1047ef0dc Mon Sep 17 00:00:00 2001
From: zhong jiang <zhongjiang@huawei.com>
Date: Tue, 2 Aug 2016 14:06:22 -0700
Subject: kexec: add restriction on kexec_load() segment sizes

I hit the following issue when run trinity in my system.  The kernel is
3.4 version, but mainline has the same issue.

The root cause is that the segment size is too large so the kerenl
spends too long trying to allocate a page.  Other cases will block until
the test case quits.  Also, OOM conditions will occur.

Call Trace:
  __alloc_pages_nodemask+0x14c/0x8f0
  alloc_pages_current+0xaf/0x120
  kimage_alloc_pages+0x10/0x60
  kimage_alloc_control_pages+0x5d/0x270
  machine_kexec_prepare+0xe5/0x6c0
  ? kimage_free_page_list+0x52/0x70
  sys_kexec_load+0x141/0x600
  ? vfs_write+0x100/0x180
  system_call_fastpath+0x16/0x1b

The patch changes sanity_check_segment_list() to verify that the usage by
all segments does not exceed half of memory.

[akpm@linux-foundation.org: fix for kexec-return-error-number-directly.patch, update comment]
Link: http://lkml.kernel.org/r/1469625474-53904-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 704534029a00..561675589511 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -146,6 +146,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded);
  * allocating pages whose destination address we do not care about.
  */
 #define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
 
 static struct page *kimage_alloc_page(struct kimage *image,
 				       gfp_t gfp_mask,
@@ -155,6 +156,7 @@ int sanity_check_segment_list(struct kimage *image)
 {
 	int i;
 	unsigned long nr_segments = image->nr_segments;
+	unsigned long total_pages = 0;
 
 	/*
 	 * Verify we have good destination addresses.  The caller is
@@ -214,6 +216,21 @@ int sanity_check_segment_list(struct kimage *image)
 			return -EINVAL;
 	}
 
+	/*
+	 * Verify that no more than half of memory will be consumed. If the
+	 * request from userspace is too large, a large amount of time will be
+	 * wasted allocating pages, which can cause a soft lockup.
+	 */
+	for (i = 0; i < nr_segments; i++) {
+		if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2)
+			return -EINVAL;
+
+		total_pages += PAGE_COUNT(image->segment[i].memsz);
+	}
+
+	if (total_pages > totalram_pages / 2)
+		return -EINVAL;
+
 	/*
 	 * Verify we have good destination addresses.  Normally
 	 * the caller is responsible for making certain we don't
-- 
cgit v1.2.3


From 59dbb2a06fc2bcb752b964e036884fe9bb9dbbe0 Mon Sep 17 00:00:00 2001
From: Akash Goel <akash.goel@intel.com>
Date: Tue, 2 Aug 2016 14:07:18 -0700
Subject: relay: add global mode support for buffer-only channels

Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early
logging") added support to use channels with no associated files.

This is useful when the exact location of relay file is not known or the
the parent directory of relay file is not available, while creating the
channel and the logging has to start right from the boot.

But there was no provision to use global mode with buffer-only channels,
which is added by this patch, without modifying the interface where
initially there will be a dummy invocation of create_buf_file callback
through which kernel client can convey the need of a global buffer.

For the use case where drivers/kernel clients want a simple interface
for the userspace, which enables them to capture data/logs from relay
file inorder & without any post processing, support of Global buffer
mode is warranted.

Modules, like i915, using relay_open() in early init would have to later
register their buffer-only relays, once debugfs is available, by calling
relay_late_setup_files().  Hence relay_late_setup_files() symbol also
needs to be exported.

Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com
Signed-off-by: Akash Goel <akash.goel@intel.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 04d7cf3ef8cf..d797502140b9 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 		if (!dentry)
 			goto free_buf;
 		relay_set_buf_dentry(buf, dentry);
+	} else {
+		/* Only retrieve global info, nothing more, nothing less */
+		dentry = chan->cb->create_buf_file(NULL, NULL,
+						   S_IRUSR, buf,
+						   &chan->is_global);
+		if (WARN_ON(dentry))
+			goto free_buf;
 	}
 
  	buf->cpu = cpu;
@@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb,
  *	attributes specified.  The created channel buffer files
  *	will be named base_filename0...base_filenameN-1.  File
  *	permissions will be %S_IRUSR.
+ *
+ *	If opening a buffer (@parent = NULL) that you later wish to register
+ *	in a filesystem, call relay_late_setup_files() once the @parent dentry
+ *	is available.
  */
 struct rchan *relay_open(const char *base_filename,
 			 struct dentry *parent,
@@ -640,8 +651,12 @@ static void __relay_set_buf_dentry(void *info)
  *
  *	Returns 0 if successful, non-zero otherwise.
  *
- *	Use to setup files for a previously buffer-only channel.
- *	Useful to do early tracing in kernel, before VFS is up, for example.
+ *	Use to setup files for a previously buffer-only channel created
+ *	by relay_open() with a NULL parent dentry.
+ *
+ *	For example, this is useful for perfomring early tracing in kernel,
+ *	before VFS is up and then exposing the early results once the dentry
+ *	is available.
  */
 int relay_late_setup_files(struct rchan *chan,
 			   const char *base_filename,
@@ -666,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan,
 	}
 	chan->has_base_filename = 1;
 	chan->parent = parent;
+
+	if (chan->is_global) {
+		err = -EINVAL;
+		if (!WARN_ON_ONCE(!chan->buf[0])) {
+			dentry = relay_create_buf_file(chan, chan->buf[0], 0);
+			if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
+				relay_set_buf_dentry(chan->buf[0], dentry);
+				err = 0;
+			}
+		}
+		mutex_unlock(&relay_channels_mutex);
+		return err;
+	}
+
 	curr_cpu = get_cpu();
 	/*
 	 * The CPU hotplug notifier ran before us and created buffers with
@@ -706,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan,
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(relay_late_setup_files);
 
 /**
  *	relay_switch_subbuf - switch to a new sub-buffer
-- 
cgit v1.2.3


From 27eb6622ab67bad75814c9b7b08096cfb16be63a Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Aug 2016 14:07:24 -0700
Subject: config: add android config fragments

Copy the config fragments from the AOSP common kernel android-4.4
branch.  It is becoming possible to run mainline kernels with Android,
but the kernel defconfigs don't work as-is and debugging missing config
options is a pain.  Adding the config fragments into the kernel tree,
makes configuring a mainline kernel as simple as:

  make ARCH=arm multi_v7_defconfig android-base.config android-recommended.config

The following non-upstream config options were removed:

  CONFIG_NETFILTER_XT_MATCH_QTAGUID
  CONFIG_NETFILTER_XT_MATCH_QUOTA2
  CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
  CONFIG_PPPOLAC
  CONFIG_PPPOPNS
  CONFIG_SECURITY_PERF_EVENTS_RESTRICT
  CONFIG_USB_CONFIGFS_F_MTP
  CONFIG_USB_CONFIGFS_F_PTP
  CONFIG_USB_CONFIGFS_F_ACC
  CONFIG_USB_CONFIGFS_F_AUDIO_SRC
  CONFIG_USB_CONFIGFS_UEVENT
  CONFIG_INPUT_KEYCHORD
  CONFIG_INPUT_KEYRESET

Link: http://lkml.kernel.org/r/1466708235-28593-1-git-send-email-robh@kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Cc: Amit Pundir <amit.pundir@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Dmitry Shmidt <dimitrysh@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/configs/android-base.config        | 152 ++++++++++++++++++++++++++++++
 kernel/configs/android-recommended.config | 121 ++++++++++++++++++++++++
 2 files changed, 273 insertions(+)
 create mode 100644 kernel/configs/android-base.config
 create mode 100644 kernel/configs/android-recommended.config

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000000..9f748ed7bea8
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,152 @@
+#  KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_MODULES is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000000..e3b953e966d2
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,121 @@
+#  KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_UEVENT=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
-- 
cgit v1.2.3


From 9502514f2808d29f6f2afa1c410e7808898dede1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 19 Jul 2016 05:59:24 +0930
Subject: module: Do a WARN_ON_ONCE() for assert module mutex not held

When running with lockdep enabled, I triggered the WARN_ON() in the
module code that asserts when module_mutex or rcu_read_lock_sched are
not held. The issue I have is that this can also be called from the
dump_stack() code, causing us to enter an infinite loop...

 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8fa70 ffff880215e8fa70 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8fac0 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8f7a0 ffff880215e8f7a0 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8f7f0 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
 Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14
 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
  ffff880215e8f4d0 ffff880215e8f4d0 ffffffff812fc8e3 0000000000000000
  ffffffff81d3e55b ffff880215e8f520 ffffffff8104fc88 ffffffff8104fcab
  0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001
 Call Trace:
  [<ffffffff812fc8e3>] dump_stack+0x67/0x90
  [<ffffffff8104fc88>] __warn+0xcb/0xe9
  [<ffffffff8104fcab>] ? warn_slowpath_null+0x5/0x1f
 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e
[...]

Which gives us rather useless information. Worse yet, there's some race
that causes this, and I seldom trigger it, so I have no idea what
happened.

This would not be an issue if that warning was a WARN_ON_ONCE().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 7f21ab238aa7..beaebea627ff 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -264,7 +264,7 @@ static void module_assert_mutex_or_preempt(void)
 	if (unlikely(!debug_locks))
 		return;
 
-	WARN_ON(!rcu_read_lock_sched_held() &&
+	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
 		!lockdep_is_held(&module_mutex));
 #endif
 }
-- 
cgit v1.2.3


From be7de5f91fdc3a63ee01910c43f20db213445ce4 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Thu, 21 Jul 2016 15:37:56 +0930
Subject: modules: Add kernel parameter to blacklist modules

Blacklisting a module in linux has long been a problem.  The current
procedure is to use rd.blacklist=module_name, however, that doesn't
cover the case after the initramfs and before a boot prompt (where one
is supposed to use /etc/modprobe.d/blacklist.conf to blacklist
runtime loading). Using rd.shell to get an early prompt is hit-or-miss,
and doesn't cover all situations AFAICT.

This patch adds this functionality of permanently blacklisting a module
by its name via the kernel parameter module_blacklist=module_name.

[v2]: Rusty, use core_param() instead of __setup() which simplifies
things.

[v3]: Rusty, undo wreckage from strsep()

[v4]: Rusty, simpler version of blacklisted()

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index beaebea627ff..c91c2fdca2e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3168,6 +3168,27 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
 	return 0;
 }
 
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(char *module_name)
+{
+	const char *p;
+	size_t len;
+
+	if (!module_blacklist)
+		return false;
+
+	for (p = module_blacklist; *p; p += len) {
+		len = strcspn(p, ",");
+		if (strlen(module_name) == len && !memcmp(module_name, p, len))
+			return true;
+		if (p[len] == ',')
+			len++;
+	}
+	return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
 	/* Module within temporary copy. */
@@ -3178,6 +3199,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	if (IS_ERR(mod))
 		return mod;
 
+	if (blacklisted(mod->name))
+		return ERR_PTR(-EPERM);
+
 	err = check_modinfo(mod, info, flags);
 	if (err)
 		return ERR_PTR(err);
-- 
cgit v1.2.3


From bdc9f373551dd3f1e6fae1618f2394ee9bc7db2e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 27 Jul 2016 12:17:35 +0930
Subject: jump_label: disable preemption around __module_text_address().

Steven reported a warning caused by not holding module_mutex or
rcu_read_lock_sched: his backtrace was corrupted but a quick audit
found this possible cause.  It's wrong anyway...

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/jump_label.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..0eef93962a91 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -284,11 +284,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
 {
 	struct module *mod;
 
+	preempt_disable();
 	mod = __module_text_address((unsigned long)start);
+	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+	preempt_enable();
+
 	if (!mod)
 		return 0;
 
-	WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
 
 	return __jump_label_text_reserved(mod->jump_entries,
 				mod->jump_entries + mod->num_jump_entries,
-- 
cgit v1.2.3


From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Wed, 27 Jul 2016 12:06:21 +0930
Subject: modules: add ro_after_init support

Add ro_after_init support for modules by adding a new page-aligned section
in the module layout (after rodata) for ro_after_init data and enabling RO
protection for that section after module init runs.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/livepatch/core.c |  2 +-
 kernel/module.c         | 66 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 55 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5c2bc1052691..8bbe50704621 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod,
 			break;
 	}
 
-	module_enable_ro(pmod);
+	module_enable_ro(pmod, true);
 	return ret;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index c91c2fdca2e6..205a71a97852 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod)
  * from modification and any data from execution.
  *
  * General layout of module is:
- *          [text] [read-only-data] [writable data]
- * text_size -----^                ^               ^
- * ro_size ------------------------|               |
- * size -------------------------------------------|
+ *          [text] [read-only-data] [ro-after-init] [writable data]
+ * text_size -----^                ^               ^               ^
+ * ro_size ------------------------|               |               |
+ * ro_after_init_size -----------------------------|               |
+ * size -----------------------------------------------------------|
  *
  * These values are always page-aligned (as is base)
  */
@@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout,
 		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
 }
 
+static void frob_ro_after_init(const struct module_layout *layout,
+				int (*set_memory)(unsigned long start, int num_pages))
+{
+	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+	set_memory((unsigned long)layout->base + layout->ro_size,
+		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+}
+
 static void frob_writable_data(const struct module_layout *layout,
 			       int (*set_memory)(unsigned long start, int num_pages))
 {
 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));
-	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
 	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
-	set_memory((unsigned long)layout->base + layout->ro_size,
-		   (layout->size - layout->ro_size) >> PAGE_SHIFT);
+	set_memory((unsigned long)layout->base + layout->ro_after_init_size,
+		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
 }
 
 /* livepatching wants to disable read-only so it can frob module. */
@@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod)
 {
 	frob_text(&mod->core_layout, set_memory_rw);
 	frob_rodata(&mod->core_layout, set_memory_rw);
+	frob_ro_after_init(&mod->core_layout, set_memory_rw);
 	frob_text(&mod->init_layout, set_memory_rw);
 	frob_rodata(&mod->init_layout, set_memory_rw);
 }
 
-void module_enable_ro(const struct module *mod)
+void module_enable_ro(const struct module *mod, bool after_init)
 {
 	frob_text(&mod->core_layout, set_memory_ro);
 	frob_rodata(&mod->core_layout, set_memory_ro);
 	frob_text(&mod->init_layout, set_memory_ro);
 	frob_rodata(&mod->init_layout, set_memory_ro);
+
+	if (after_init)
+		frob_ro_after_init(&mod->core_layout, set_memory_ro);
 }
 
 static void module_enable_nx(const struct module *mod)
 {
 	frob_rodata(&mod->core_layout, set_memory_nx);
+	frob_ro_after_init(&mod->core_layout, set_memory_nx);
 	frob_writable_data(&mod->core_layout, set_memory_nx);
 	frob_rodata(&mod->init_layout, set_memory_nx);
 	frob_writable_data(&mod->init_layout, set_memory_nx);
@@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod)
 static void module_disable_nx(const struct module *mod)
 {
 	frob_rodata(&mod->core_layout, set_memory_x);
+	frob_ro_after_init(&mod->core_layout, set_memory_x);
 	frob_writable_data(&mod->core_layout, set_memory_x);
 	frob_rodata(&mod->init_layout, set_memory_x);
 	frob_writable_data(&mod->init_layout, set_memory_x);
@@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout)
 	frob_text(layout, set_memory_rw);
 	frob_rodata(layout, set_memory_rw);
 	frob_rodata(layout, set_memory_x);
+	frob_ro_after_init(layout, set_memory_rw);
+	frob_ro_after_init(layout, set_memory_x);
 	frob_writable_data(layout, set_memory_x);
 }
 
@@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
 		 * finder in the two loops below */
 		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+		{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
 		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
 	};
@@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->core_layout.size = debug_align(mod->core_layout.size);
 			mod->core_layout.ro_size = mod->core_layout.size;
 			break;
-		case 3: /* whole core */
+		case 2: /* RO after init */
+			mod->core_layout.size = debug_align(mod->core_layout.size);
+			mod->core_layout.ro_after_init_size = mod->core_layout.size;
+			break;
+		case 4: /* whole core */
 			mod->core_layout.size = debug_align(mod->core_layout.size);
 			break;
 		}
@@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info)
 			mod->init_layout.size = debug_align(mod->init_layout.size);
 			mod->init_layout.ro_size = mod->init_layout.size;
 			break;
-		case 3: /* whole init */
+		case 2:
+			/*
+			 * RO after init doesn't apply to init_layout (only
+			 * core_layout), so it just takes the value of ro_size.
+			 */
+			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+			break;
+		case 4: /* whole init */
 			mod->init_layout.size = debug_align(mod->init_layout.size);
 			break;
 		}
@@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
 	/* Module within temporary copy. */
 	struct module *mod;
+	unsigned int ndx;
 	int err;
 
 	mod = setup_load_info(info, flags);
@@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	/* We will do a special allocation for per-cpu sections later. */
 	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
+	/*
+	 * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+	 * layout_sections() can put it in the right place.
+	 * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+	 */
+	ndx = find_sec(info, ".data..ro_after_init");
+	if (ndx)
+		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+
 	/* Determine total sizes, and put offsets in sh_entsize.  For now
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
@@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod)
 	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
 	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
+	module_enable_ro(mod, true);
 	mod_tree_remove_init(mod);
 	disable_ro_nx(&mod->init_layout);
 	module_arch_freeing_init(mod);
 	mod->init_layout.base = NULL;
 	mod->init_layout.size = 0;
 	mod->init_layout.ro_size = 0;
+	mod->init_layout.ro_after_init_size = 0;
 	mod->init_layout.text_size = 0;
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be
@@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	/* This relies on module_mutex for list integrity. */
 	module_bug_finalize(info->hdr, info->sechdrs, mod);
 
-	/* Set RO and NX regions */
-	module_enable_ro(mod);
+	module_enable_ro(mod, false);
 	module_enable_nx(mod);
 
 	/* Mark state as coming so strong_try_module_get() ignores us,
-- 
cgit v1.2.3


From 97f2645f358b411ba2afb22e5966753f0ad92916 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Wed, 3 Aug 2016 13:45:50 -0700
Subject: tree-wide: replace config_enabled() with IS_ENABLED()

The use of config_enabled() against config options is ambiguous.  In
practical terms, config_enabled() is equivalent to IS_BUILTIN(), but the
author might have used it for the meaning of IS_ENABLED().  Using
IS_ENABLED(), IS_BUILTIN(), IS_MODULE() etc.  makes the intention
clearer.

This commit replaces config_enabled() with IS_ENABLED() where possible.
This commit is only touching bool config options.

I noticed two cases where config_enabled() is used against a tristate
option:

 - config_enabled(CONFIG_HWMON)
  [ drivers/net/wireless/ath/ath10k/thermal.c ]

 - config_enabled(CONFIG_BACKLIGHT_CLASS_DEVICE)
  [ drivers/gpu/drm/gma500/opregion.c ]

I did not touch them because they should be converted to IS_BUILTIN()
in order to keep the logic, but I was not sure it was the authors'
intention.

Link: http://lkml.kernel.org/r/1465215656-20569-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Stas Sergeev <stsp@list.ru>
Cc: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: Joshua Kinard <kumba@gentoo.org>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: "Dmitry V. Levin" <ldv@altlinux.org>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Will Drewry <wad@chromium.org>
Cc: Nikolay Martynov <mar.kolya@gmail.com>
Cc: Huacai Chen <chenhc@lemote.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
Cc: Rafal Milecki <zajec5@gmail.com>
Cc: James Cowgill <James.Cowgill@imgtec.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Alex Smith <alex.smith@imgtec.com>
Cc: Adam Buchbinder <adam.buchbinder@gmail.com>
Cc: Qais Yousef <qais.yousef@imgtec.com>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Mikko Rapeli <mikko.rapeli@iki.fi>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Tony Wu <tung7970@gmail.com>
Cc: Huaitong Han <huaitong.han@intel.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Gelmini <andrea.gelmini@gelma.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Rabin Vincent <rabin@rab.in>
Cc: "Maciej W. Rozycki" <macro@imgtec.com>
Cc: David Daney <david.daney@cavium.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c  | 4 ++--
 kernel/seccomp.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d49bfa1e53e6..1d3b7665d0be 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 		return -EINVAL;
 
 	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
-		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
-		    !config_enabled(CONFIG_SECCOMP))
+		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+		    !IS_ENABLED(CONFIG_SECCOMP))
 			return -EINVAL;
 
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54d15eb2b701..ef6c6c3f9d8a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *sfilter;
 	int ret;
-	const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
+	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall)
 {
 	int mode = current->seccomp.mode;
 
-	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
 		return;
 
@@ -655,7 +655,7 @@ int __secure_computing(const struct seccomp_data *sd)
 	int mode = current->seccomp.mode;
 	int this_syscall;
 
-	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
 	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
 		return 0;
 
-- 
cgit v1.2.3


From 1f69bf9c6137602cd028c96b4f8329121ec89231 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 3 Aug 2016 13:46:36 -0700
Subject: jump_label: remove bug.h, atomic.h dependencies for HAVE_JUMP_LABEL

The current jump_label.h includes bug.h for things such as WARN_ON().
This makes the header problematic for inclusion by kernel.h or any
headers that kernel.h includes, since bug.h includes kernel.h (circular
dependency).  The inclusion of atomic.h is similarly problematic.  Thus,
this should make jump_label.h 'includable' from most places.

Link: http://lkml.kernel.org/r/7060ce35ddd0d20b33bf170685e6b0fab816bdf2.1467837322.git.jbaron@akamai.com
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Joe Perches <joe@perches.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/jump_label.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0dbea887d625..f19aa02a8f48 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/static_key.h>
 #include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
 
 static void jump_label_update(struct static_key *key);
 
+/*
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and its problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for HAVE_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+	/*
+	 * -1 means the first static_key_slow_inc() is in progress.
+	 *  static_key_enabled() must return true, so return 1 here.
+	 */
+	int n = atomic_read(&key->enabled);
+
+	return n >= 0 ? n : 1;
+}
+EXPORT_SYMBOL_GPL(static_key_count);
+
+void static_key_enable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (!count)
+		static_key_slow_inc(key);
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable(struct static_key *key)
+{
+	int count = static_key_count(key);
+
+	WARN_ON_ONCE(count < 0 || count > 1);
+
+	if (count)
+		static_key_slow_dec(key);
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
 void static_key_slow_inc(struct static_key *key)
 {
 	int v, v1;
@@ -235,6 +279,15 @@ void __init jump_label_init(void)
 	struct static_key *key = NULL;
 	struct jump_entry *iter;
 
+	/*
+	 * Since we are initializing the static_key.enabled field with
+	 * with the 'raw' int values (to avoid pulling in atomic.h) in
+	 * jump_label.h, let's make sure that is safe. There are only two
+	 * cases to check since we initialize to 0 or 1.
+	 */
+	BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+	BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
-- 
cgit v1.2.3


From 1eff9d322a444245c67515edb52bc0eb68374aa8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 5 Aug 2016 15:35:16 -0600
Subject: block: rename bio bi_rw to bi_opf

Since commit 63a4cc24867d, bio->bi_rw contains flags in the lower
portion and the op code in the higher portions. This means that
old code that relies on manually setting bi_rw is most likely
going to be broken. Instead of letting that brokeness linger,
rename the member, to force old and out-of-tree code to break
at compile time instead of at runtime.

No intended functional changes in this commit.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb345cd11883..7598e6ca817a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_rw, what, error, 0, NULL);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore,
 		__be64 rpdu = cpu_to_be64(pdu);
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
-				bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
 				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
 				&rpdu);
 	}
@@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
 			sizeof(r), &r);
 }
 
-- 
cgit v1.2.3


From 574673c231a5fad1560249cc3a598907acb36cf9 Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <andreas.ziegler@fau.de>
Date: Thu, 4 Aug 2016 09:52:09 +0200
Subject: printk: Remove unnecessary #ifdef CONFIG_PRINTK

In commit 874f9c7da9a4 ("printk: create pr_<level> functions"), new
pr_level defines were added to printk.c.

These new defines are guarded by an #ifdef CONFIG_PRINTK - however,
there is already a surrounding #ifdef CONFIG_PRINTK starting a lot
earlier in line 249 which means the newly introduced #ifdef is
unnecessary.

Let's remove it to avoid confusion.

Signed-off-by: Andreas Ziegler <andreas.ziegler@fau.de>
Cc: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a5ef95ca18c9..a37fc8cf8e84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,7 +1930,6 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#ifdef CONFIG_PRINTK
 #define define_pr_level(func, loglevel)				\
 asmlinkage __visible void func(const char *fmt, ...)		\
 {								\
@@ -1949,7 +1948,6 @@ define_pr_level(__pr_err, LOGLEVEL_ERR);
 define_pr_level(__pr_warn, LOGLEVEL_WARNING);
 define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
 define_pr_level(__pr_info, LOGLEVEL_INFO);
-#endif
 
 int vprintk_default(int level, const char *fmt, va_list args)
 {
-- 
cgit v1.2.3


From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Aug 2016 10:48:18 -0700
Subject: Revert "printk: create pr_<level> functions"

This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142.

Geert Uytterhoeven reports:
 "This change seems to have an (unintendent?) side-effect.

  Before, pr_*() calls without a trailing newline characters would be
  printed with a newline character appended, both on the console and in
  the output of the dmesg command.

  After this commit, no new line character is appended, and the output
  of the next pr_*() call of the same type may be appended, like in:

    - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000
    - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)
    + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)"

Joe Perches says:
 "No, that is not intentional.

  The newline handling code inside vprintk_emit is a bit involved and
  for now I suggest a revert until this has all the same behavior as
  earlier"

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Requested-by: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/internal.h | 16 ++++++----------
 kernel/printk/nmi.c      | 13 ++-----------
 kernel/printk/printk.c   | 25 +++----------------------
 3 files changed, 11 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5d4505f30083..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,11 +16,9 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
-					    va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
-__printf(2, 0)
-int vprintk_default(int level, const char *fmt, va_list args);
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(level, fmt, args);
+	return this_cpu_read(printk_func)(fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return vprintk_default(level, fmt, args);
+	return vprintk_default(fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bc3eeb1ae6da..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(int level, const char *fmt, va_list args)
+static int vprintk_nmi(const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,16 +79,7 @@ again:
 	if (!len)
 		smp_rmb();
 
-	if (level != LOGLEVEL_DEFAULT) {
-		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
-				KERN_SOH "%c", '0' + level);
-		add += vsnprintf(s->buffer + len + add,
-				 sizeof(s->buffer) - len - add,
-				 fmt, args);
-	} else {
-		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
-				fmt, args);
-	}
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a37fc8cf8e84..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#define define_pr_level(func, loglevel)				\
-asmlinkage __visible void func(const char *fmt, ...)		\
-{								\
-	va_list args;						\
-								\
-	va_start(args, fmt);					\
-	vprintk_default(loglevel, fmt, args);			\
-	va_end(args);						\
-}								\
-EXPORT_SYMBOL(func)
-
-define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
-define_pr_level(__pr_alert, LOGLEVEL_ALERT);
-define_pr_level(__pr_crit, LOGLEVEL_CRIT);
-define_pr_level(__pr_err, LOGLEVEL_ERR);
-define_pr_level(__pr_warn, LOGLEVEL_WARNING);
-define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
-define_pr_level(__pr_info, LOGLEVEL_INFO);
-
-int vprintk_default(int level, const char *fmt, va_list args)
+int vprintk_default(const char *fmt, va_list args)
 {
 	int r;
 
@@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, level, NULL, 0, fmt, args);
+	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
+	r = vprintk_func(fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.2.3