From def98c84b6cdf2eeea19ec5736e90e316df5206b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Sep 2019 18:43:40 -0700 Subject: workqueue: Fix spurious sanity check failures in destroy_workqueue() Before actually destrying a workqueue, destroy_workqueue() checks whether it's actually idle. If it isn't, it prints out a bunch of warning messages and leaves the workqueue dangling. It unfortunately has a couple issues. * Mayday list queueing increments pwq's refcnts which gets detected as busy and fails the sanity checks. However, because mayday list queueing is asynchronous, this condition can happen without any actual work items left in the workqueue. * Sanity check failure leaves the sysfs interface behind too which can lead to init failure of newer instances of the workqueue. This patch fixes the above two by * If a workqueue has a rescuer, disable and kill the rescuer before sanity checks. Disabling and killing is guaranteed to flush the existing mayday list. * Remove sysfs interface before sanity checks. Signed-off-by: Tejun Heo Reported-by: Marcin Pawlowski Reported-by: "Williams, Gerald S" Cc: stable@vger.kernel.org --- kernel/workqueue.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc2e09a8ea61..93e20f5330fc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4325,9 +4325,28 @@ void destroy_workqueue(struct workqueue_struct *wq) struct pool_workqueue *pwq; int node; + /* + * Remove it from sysfs first so that sanity check failure doesn't + * lead to sysfs name conflicts. + */ + workqueue_sysfs_unregister(wq); + /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ + if (wq->rescuer) { + struct worker *rescuer = wq->rescuer; + + /* this prevents new queueing */ + spin_lock_irq(&wq_mayday_lock); + wq->rescuer = NULL; + spin_unlock_irq(&wq_mayday_lock); + + /* rescuer will empty maydays list before exiting */ + kthread_stop(rescuer->task); + } + /* sanity checks */ mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { @@ -4359,11 +4378,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - workqueue_sysfs_unregister(wq); - - if (wq->rescuer) - kthread_stop(wq->rescuer->task); - if (!(wq->flags & WQ_UNBOUND)) { wq_unregister_lockdep(wq); /* -- cgit v1.2.3 From 8efe1223d73c218ce7e8b2e0e9aadb974b582d7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Sep 2019 13:39:57 -0700 Subject: workqueue: Fix missing kfree(rescuer) in destroy_workqueue() Signed-off-by: Tejun Heo Reported-by: Qian Cai Fixes: def98c84b6cd ("workqueue: Fix spurious sanity check failures in destroy_workqueue()") --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 93e20f5330fc..3f067f1d72e3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4345,6 +4345,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* rescuer will empty maydays list before exiting */ kthread_stop(rescuer->task); + kfree(rescuer); } /* sanity checks */ -- cgit v1.2.3 From 30ae2fc0a75eb5f1a38bbee7355d8e3bc823a6e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Sep 2019 14:09:14 -0700 Subject: workqueue: Minor follow-ups to the rescuer destruction change * Now that wq->rescuer may be cleared while rescuer is still there, switch show_pwq() debug printout to test worker->rescue_wq 
to identify rescuers intead of testing wq->rescuer. * Update comment on ->rescuer locking. Signed-off-by: Tejun Heo Suggested-by: Lai Jiangshan --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3f067f1d72e3..7f7aa45dac37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -248,7 +248,7 @@ struct workqueue_struct { struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ - struct worker *rescuer; /* I: rescue worker */ + struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ @@ -4672,7 +4672,7 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), - worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work); -- cgit v1.2.3 From 2cb80dbbbaba4f2f86f686c34cb79ea5cbfb0edb Mon Sep 17 00:00:00 2001 From: Iurii Zaikin Date: Mon, 23 Sep 2019 02:02:47 -0700 Subject: kernel/sysctl-test: Add null pointer test for sysctl.c:proc_dointvec() KUnit tests for initialized data behavior of proc_dointvec that is explicitly checked in the code. Includes basic parsing tests including int min/max overflow. Signed-off-by: Iurii Zaikin Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Acked-by: Luis Chamberlain Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- kernel/Makefile | 2 + kernel/sysctl-test.c | 392 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 394 insertions(+) create mode 100644 kernel/sysctl-test.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..f0902a7bd1b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,8 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o + obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c new file mode 100644 index 000000000000..2a63241a8453 --- /dev/null +++ b/kernel/sysctl-test.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of proc sysctl. + */ + +#include +#include + +#define KUNIT_PROC_READ 0 +#define KUNIT_PROC_WRITE 1 + +static int i_zero; +static int i_one_hundred = 100; + +/* + * Test that proc_dointvec will not try to use a NULL .data field even when the + * length is non-zero. + */ +static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test) +{ + struct ctl_table null_data_table = { + .procname = "foo", + /* + * Here we are testing that proc_dointvec behaves correctly when + * we give it a NULL .data field. Normally this would point to a + * piece of memory where the value would be stored. + */ + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + /* + * proc_dointvec expects a buffer in user space, so we allocate one. We + * also need to cast it to __user so sparse doesn't get mad. 
+ */ + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * We don't care what the starting length is since proc_dointvec should + * not try to read because .data is NULL. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See above. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Similar to the previous test, we create a struct ctrl_table that has a .data + * field that proc_dointvec cannot do anything with; however, this time it is + * because we tell proc_dointvec that the size is 0. + */ +static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test) +{ + int data = 0; + struct ctl_table data_maxlen_unset_table = { + .procname = "foo", + .data = &data, + /* + * So .data is no longer NULL, but we tell proc_dointvec its + * length is 0, so it still shouldn't try to use it. + */ + .maxlen = 0, + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * As before, we don't care what buffer length is because proc_dointvec + * cannot do anything because its internal .data buffer has zero length. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See previous comment. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Here we provide a valid struct ctl_table, but we try to read and write from + * it using a buffer of zero length, so it should still fail in a similar way as + * before. + */ +static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * However, now our read/write buffer has zero length. + */ + size_t len = 0; + loff_t pos; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that proc_dointvec refuses to read when the file position is non-zero. + */ +static void sysctl_test_api_dointvec_table_read_but_position_set( + struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * We don't care about our buffer length because we start off with a + * non-zero file position. 
+ */ + size_t len = 1234; + /* + * proc_dointvec should refuse to read into the buffer since the file + * pos is non-zero. + */ + loff_t pos = 1; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that we can read a two digit number in a sufficiently size buffer. + * Nothing fancy. + */ +static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 4; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + /* Store 13 in the data field. */ + *((int *)table.data) = 13; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)3, len); + buffer[len] = '\0'; + /* And we read 13 back out. */ + KUNIT_EXPECT_STREQ(test, "13\n", buffer); +} + +/* + * Same as previous test, just now with negative numbers. + */ +static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 5; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + *((int *)table.data) = -16; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)4, len); + buffer[len] = '\0'; + KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer); +} + +/* + * Test that a simple positive write works. + */ +static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, 9, *((int *)table.data)); +} + +/* + * Same as previous test, but now with negative numbers. 
+ */ +static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "-9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, -9, *((int *)table.data)); +} + +/* + * Test that writing a value smaller than the minimum possible value is not + * allowed. + */ +static void sysctl_test_api_dointvec_write_single_less_int_min( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long abs_of_less_than_min = (unsigned long)INT_MAX + - (INT_MAX + INT_MIN) + 1; + + /* + * We use this rigmarole to create a string that contains a value one + * less than the minimum accepted value. + */ + KUNIT_ASSERT_LT(test, + (size_t)snprintf(buffer, max_len, "-%lu", + abs_of_less_than_min), + max_len); + + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +/* + * Test that writing the maximum possible value works. 
+ */ +static void sysctl_test_api_dointvec_write_single_greater_int_max( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long greater_than_max = (unsigned long)INT_MAX + 1; + + KUNIT_ASSERT_GT(test, greater_than_max, (unsigned long)INT_MAX); + KUNIT_ASSERT_LT(test, (size_t)snprintf(buffer, max_len, "%lu", + greater_than_max), + max_len); + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +static struct kunit_case sysctl_test_cases[] = { + KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), + KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), + KUNIT_CASE(sysctl_test_api_dointvec_table_len_is_zero), + KUNIT_CASE(sysctl_test_api_dointvec_table_read_but_position_set), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_negative), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), + {} +}; + +static struct kunit_suite sysctl_test_suite = { + .name = "sysctl_test", + .test_cases = sysctl_test_cases, +}; + +kunit_test_suite(sysctl_test_suite); -- cgit v1.2.3 From c29eb85386880750130a01aabf288408a6614d65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Sep 2019 11:08:58 -0700 Subject: workqueue: more destroy_workqueue() fixes destroy_workqueue() warnings still, at a lower frequency, trigger spuriously. The problem seems to be in-flight operations which haven't reached put_pwq() yet. * Make sanity check grab all the related locks so that it's synchronized against operations which puts pwq at the end. * Always print out the offending pwq. 
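For illustration only (not part of this patch), here is a minimal driver-style sketch of the usage pattern this destroy_workqueue() series hardens: a WQ_MEM_RECLAIM workqueue owns a rescuer thread, so mayday queueing and put_pwq() operations may still be in flight when the queue is torn down. All names below are hypothetical.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_fn(struct work_struct *work)
{
	/* work item body */
}
static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* WQ_MEM_RECLAIM gives the workqueue a rescuer thread. */
	demo_wq = alloc_workqueue("demo_wq", WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		return -ENOMEM;
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	/*
	 * destroy_workqueue() drains the queue, stops the rescuer and only
	 * then runs the sanity checks made reliable by this series.
	 */
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");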
Signed-off-by: Tejun Heo Cc: "Williams, Gerald S" --- kernel/workqueue.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7f7aa45dac37..4a3c30177b94 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -355,6 +355,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); +static void show_pwq(struct pool_workqueue *pwq); #define CREATE_TRACE_POINTS #include @@ -4314,6 +4315,22 @@ err_destroy: } EXPORT_SYMBOL_GPL(alloc_workqueue); +static bool pwq_busy(struct pool_workqueue *pwq) +{ + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + if (pwq->nr_in_flight[i]) + return true; + + if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) + return true; + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + return true; + + return false; +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -4348,28 +4365,28 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(rescuer); } - /* sanity checks */ + /* + * Sanity checks - grab all the locks so that we wait for all + * in-flight operations which may do put_pwq(). + */ + mutex_lock(&wq_pool_mutex); mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { - int i; - - for (i = 0; i < WORK_NR_COLORS; i++) { - if (WARN_ON(pwq->nr_in_flight[i])) { - mutex_unlock(&wq->mutex); - show_workqueue_state(); - return; - } - } - - if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || - WARN_ON(pwq->nr_active) || - WARN_ON(!list_empty(&pwq->delayed_works))) { + spin_lock_irq(&pwq->pool->lock); + if (WARN_ON(pwq_busy(pwq))) { + pr_warning("%s: %s has the following busy pwq (refcnt=%d)\n", + __func__, wq->name, pwq->refcnt); + show_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } + spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after -- cgit v1.2.3 From e66b39af00f426b3356b96433d620cb3367ba1ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Sep 2019 06:59:15 -0700 Subject: workqueue: Fix pwq ref leak in rescuer_thread() 008847f66c3 ("workqueue: allow rescuer thread to do more work.") made the rescuer worker requeue the pwq immediately if there may be more work items which need rescuing instead of waiting for the next mayday timer expiration. Unfortunately, it doesn't check whether the pwq is already on the mayday list and unconditionally gets the ref and moves it onto the list. This doesn't corrupt the list but creates an additional reference to the pwq. It got queued twice but will only be removed once. This leak later can trigger pwq refcnt warning on workqueue destruction and prevent freeing of the workqueue. 
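The bug class in isolation, as a hedged sketch with hypothetical names (not code from the patch): taking a reference every time a node is put on a list, without first checking whether it is already queued, leaks one reference per extra queueing because the dequeue side drops only one.

#include <linux/kref.h>
#include <linux/list.h>

struct node {
	struct kref ref;
	struct list_head link;	/* empty while not queued */
};

/* Correct pattern, mirroring the fix: queue (and take a ref) only once. */
static void enqueue_once(struct node *n, struct list_head *q)
{
	if (list_empty(&n->link)) {
		kref_get(&n->ref);
		list_add_tail(&n->link, q);
	}
}

For list_empty() to stay meaningful, the dequeue side must use list_del_init() and drop exactly one reference, which matches how mayday_node entries are handled.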
Signed-off-by: Tejun Heo Cc: "Williams, Gerald S" Cc: NeilBrown Cc: stable@vger.kernel.org # v3.19+ --- kernel/workqueue.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4a3c30177b94..4dc8270326d7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2533,8 +2533,14 @@ repeat: */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); - get_pwq(pwq); - list_move_tail(&pwq->mayday_node, &wq->maydays); + /* + * Queue iff we aren't racing destruction + * and somebody else hasn't queued it already. + */ + if (wq->rescuer && list_empty(&pwq->mayday_node)) { + get_pwq(pwq); + list_add_tail(&pwq->mayday_node, &wq->maydays); + } spin_unlock(&wq_mayday_lock); } } @@ -4374,8 +4380,8 @@ void destroy_workqueue(struct workqueue_struct *wq) for_each_pwq(pwq, wq) { spin_lock_irq(&pwq->pool->lock); if (WARN_ON(pwq_busy(pwq))) { - pr_warning("%s: %s has the following busy pwq (refcnt=%d)\n", - __func__, wq->name, pwq->refcnt); + pr_warning("%s: %s has the following busy pwq\n", + __func__, wq->name); show_pwq(pwq); spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); @@ -4670,7 +4676,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + pr_cont(" active=%d/%d refcnt=%d%s\n", + pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { -- cgit v1.2.3 From 61e867fde21ea94f2166899f24f16cad85cc7b24 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 29 Sep 2019 16:06:58 +0800 Subject: cgroup: short-circuit current_cgns_cgroup_from_root() on the default hierarchy Like commit 13d82fb77abb ("cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy"), short-circuit current_cgns_cgroup_from_root() on the default hierarchy. Signed-off-by: Miaohe Lin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..f6cba23290a1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1374,6 +1374,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; -- cgit v1.2.3 From e7c7b1d85dc1646c874096dac3cf01537c1fd6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:39 +0200 Subject: cgroup: Update comments about task exit path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We no longer take cgroup_mutex in cgroup_exit and the exiting tasks are not moved to init_css_set, reflect that in several comments to prevent confusion. 
Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f6cba23290a1..01fc24aeac71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1432,9 +1431,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * No need to lock the task - since we hold css_set_lock the + * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } @@ -6030,7 +6028,7 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; spin_lock_irq(&css_set_lock); - cset = task_css_set(current); + cset = task_css_set(current); /* current is @child's parent */ if (list_empty(&child->cg_list)) { get_css_set(cset); cset->nr_tasks++; @@ -6073,20 +6071,8 @@ void cgroup_post_fork(struct task_struct *child) * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. + * Description: Detach cgroup from @tsk. * - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { @@ -6096,7 +6082,8 @@ void cgroup_exit(struct task_struct *tsk) /* * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set + * and cg_list without synchronization. */ cset = task_css_set(tsk); @@ -6112,6 +6099,8 @@ void cgroup_exit(struct task_struct *tsk) spin_unlock_irq(&css_set_lock); } else { + /* Take reference to avoid freeing init_css_set in cgroup_free, + * see cgroup_fork(). 
*/ get_css_set(cset); } -- cgit v1.2.3 From 9a3284fad42f66bb43629c6716709ff791aaa457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 4 Oct 2019 12:57:40 +0200 Subject: cgroup: Optimize single thread migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are reports of users who use thread migrations between cgroups and they report performance drop after d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"). The effect is pronounced on machines with more CPUs. The migration is affected by forking noise happening in the background, after the mentioned commit a migrating thread must wait for all (forking) processes on the system, not only of its threadgroup. There are several places that need to synchronize with migration: a) do_exit, b) de_thread, c) copy_process, d) cgroup_update_dfl_csses, e) parallel migration (cgroup_{proc,thread}s_write). In the case of self-migrating thread, we relax the synchronization on cgroup_threadgroup_rwsem to avoid the cost of waiting. d) and e) are excluded with cgroup_mutex, c) does not matter in case of single thread migration and the executing thread cannot exec(2) or exit(2) while it is writing into cgroup.threads. In case of do_exit because of signal delivery, we either exit before the migration or finish the migration (of not yet PF_EXITING thread) and die afterwards. This patch handles only the case of self-migration by writing "0" into cgroup.threads. For simplicity, we always take cgroup_threadgroup_rwsem with numeric PIDs. This change improves migration dependent workload performance similar to per-signal_struct state. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 5 +++-- kernel/cgroup/cgroup.c | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 809e34a3c017..90d1710fef6c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7f83f4121d8d..09f3a413f6f8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - 
cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 01fc24aeac71..8b1c4fd47a7a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2824,7 +2824,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2833,7 +2834,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2864,13 +2879,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2879,7 +2897,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -4754,12 +4773,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4777,7 +4797,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4795,6 +4815,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4802,7 +4823,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4826,7 +4847,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + 
cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); -- cgit v1.2.3 From a23740ec43ba022dbfd139d0fe3eff193216272b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 9 Oct 2019 13:14:57 -0700 Subject: bpf: Track contents of read-only maps as scalars Maps that are read-only both from BPF program side and user space side have their contents constant, so verifier can track referenced values precisely and use that knowledge for dead code elimination, branch pruning, etc. This patch teaches BPF verifier how to do this. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191009201458.2679171-2-andriin@fb.com --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffc3e53f5300..b818fed3208d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2739,6 +2739,41 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) reg->smax_value = reg->umax_value; } +static bool bpf_map_is_rdonly(const struct bpf_map *map) +{ + return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; +} + +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +{ + void *ptr; + u64 addr; + int err; + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) + return err; + ptr = (void *)addr + off; + + switch (size) { + case sizeof(u8): + *val = (u64)*(u8 *)ptr; + break; + case sizeof(u16): + *val = (u64)*(u16 *)ptr; + break; + case sizeof(u32): + *val = (u64)*(u32 *)ptr; + break; + case sizeof(u64): + *val = *(u64 *)ptr; + break; + default: + return -EINVAL; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2776,9 +2811,27 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); + if (!err && t == BPF_READ && value_regno >= 0) { + struct bpf_map *map = reg->map_ptr; + + /* if map is read-only, track its contents as scalars */ + if (tnum_is_const(reg->var_off) && + bpf_map_is_rdonly(map) && + map->ops->map_direct_value_addr) { + int map_off = off + reg->var_off.value; + u64 val = 0; + err = bpf_map_direct_read(map, map_off, size, + &val); + if (err) + return err; + + regs[value_regno].type = SCALAR_VALUE; + __mark_reg_known(®s[value_regno], val); + } else { + mark_reg_unknown(env, regs, value_regno); + } + } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; -- cgit v1.2.3 From 2dedd7d2165565bafa89718eaadfc5d1a7865f66 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Oct 2019 10:20:53 -0700 Subject: bpf: Fix cast to pointer from integer of different size warning Fix "warning: cast to pointer from integer of different size" when casting u64 addr to void *. 
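A minimal sketch (with a hypothetical helper name) of why the warning fires and why the fix casts through long: on 32-bit builds void * is narrower than u64, while long is pointer-sized on all kernel targets.

#include <linux/types.h>

static void *u64_to_ptr(u64 addr)
{
	/*
	 * (void *)addr warns on 32-bit builds: "cast to pointer from
	 * integer of different size". Casting through long, which matches
	 * the pointer width, keeps the build quiet; this is the idiom the
	 * fix applies in bpf_map_direct_read().
	 */
	return (void *)(long)addr;
}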
Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: kbuild test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191011172053.2980619-1-andriin@fb.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b818fed3208d..d3446f018b9a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2753,7 +2753,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) err = map->ops->map_direct_value_addr(map, &addr, off); if (err) return err; - ptr = (void *)addr + off; + ptr = (void *)(long)addr + off; switch (size) { case sizeof(u8): -- cgit v1.2.3 From 15d42eb26bdee47c0278fbdab4198577bc6a97b5 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 14 Oct 2019 18:20:32 +0200 Subject: pidfd: add NSpid entries to fdinfo Currently, the fdinfo file contains the Pid field which shows the pid a given pidfd refers to in the pid namespace of the procfs instance. If pid namespaces are configured, also show an NSpid field for easy retrieval of the pid in all descendant pid namespaces. If the pid namespace of the process is not a descendant of the pid namespace of the procfs instance 0 will be shown as its first NSpid entry and no other entries will be shown. Add a block comment to pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields. Co-developed-by: Christian Brauner Signed-off-by: Christian Brauner Signed-off-by: Christian Kellner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191014162034.2185-1-ckellner@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..782986962d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc//status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. 
where no ancestoral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/ and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; + pid_t nr = pid_nr_ns(pid, ns); + + seq_put_decimal_ull(m, "Pid:\t", nr); - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); +#ifdef CONFIG_PID_NS + seq_put_decimal_ull(m, "\nNSpid:\t", nr); + if (nr) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + } +#endif seq_putc(m, '\n'); } #endif -- cgit v1.2.3 From eac9153f2b584c702cea02c1f1a57d85aa9aea42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 14 Oct 2019 10:12:23 -0700 Subject: bpf/stackmap: Fix deadlock with rq_lock in bpf_get_stack() bpf stackmap with build-id lookup (BPF_F_STACK_BUILD_ID) can trigger A-A deadlock on rq_lock(): rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [...] Call Trace: try_to_wake_up+0x1ad/0x590 wake_up_q+0x54/0x80 rwsem_wake+0x8a/0xb0 bpf_get_stack+0x13c/0x150 bpf_prog_fbdaf42eded9fe46_on_event+0x5e3/0x1000 bpf_overflow_handler+0x60/0x100 __perf_event_overflow+0x4f/0xf0 perf_swevent_overflow+0x99/0xc0 ___perf_sw_event+0xe7/0x120 __schedule+0x47d/0x620 schedule+0x29/0x90 futex_wait_queue_me+0xb9/0x110 futex_wait+0x139/0x230 do_futex+0x2ac/0xa50 __x64_sys_futex+0x13c/0x180 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This can be reproduced by: 1. Start a multi-thread program that does parallel mmap() and malloc(); 2. taskset the program to 2 CPUs; 3. Attach bpf program to trace_sched_switch and gather stackmap with build-id, e.g. with trace.py from bcc tools: trace.py -U -p -s t:sched:sched_switch A sample reproducer is attached at the end. This could also trigger deadlock with other locks that are nested with rq_lock. Fix this by checking whether irqs are disabled. Since rq_lock and all other nested locks are irq safe, it is safe to do up_read() when irqs are not disable. If the irqs are disabled, postpone up_read() in irq_work. 
Fixes: 615755a77b24 ("bpf: extend stackmap to save binary_build_id+offset instead of address") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191014171223.357174-1-songliubraving@fb.com Reproducer: ============================ 8< ============================ char *filename; void *worker(void *p) { void *ptr; int fd; char *pptr; fd = open(filename, O_RDONLY); if (fd < 0) return NULL; while (1) { struct timespec ts = {0, 1000 + rand() % 2000}; ptr = mmap(NULL, 4096 * 64, PROT_READ, MAP_PRIVATE, fd, 0); usleep(1); if (ptr == MAP_FAILED) { printf("failed to mmap\n"); break; } munmap(ptr, 4096 * 64); usleep(1); pptr = malloc(1); usleep(1); pptr[0] = 1; usleep(1); free(pptr); usleep(1); nanosleep(&ts, NULL); } close(fd); return NULL; } int main(int argc, char *argv[]) { void *ptr; int i; pthread_t threads[THREAD_COUNT]; if (argc < 2) return 0; filename = argv[1]; for (i = 0; i < THREAD_COUNT; i++) { if (pthread_create(threads + i, NULL, worker, NULL)) { fprintf(stderr, "Error creating thread\n"); return 0; } } for (i = 0; i < THREAD_COUNT; i++) pthread_join(threads[i], NULL); return 0; } ============================ 8< ============================ --- kernel/bpf/stackmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..173e983619d7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -287,7 +287,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, bool irq_work_busy = false; struct stack_map_irq_work *work = NULL; - if (in_nmi()) { + if (irqs_disabled()) { work = this_cpu_ptr(&up_read_work); if (work->irq_work.flags & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ @@ -295,8 +295,9 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } /* - * We cannot do up_read() in nmi context. To do build_id lookup - * in nmi context, we need to run up_read() in irq_work. We use + * We cannot do up_read() when the irq is disabled, because of + * risk to deadlock with rq_lock. To do build_id lookup when the + * irqs are disabled, we need to run up_read() in irq_work. We use * a percpu variable to do the irq_work. If the irq_work is * already used by another lookup, we fall back to report ips. * -- cgit v1.2.3 From 3d6d8da48d0b214d65ea0227d47228abc75d7c88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:28 +0200 Subject: pidfd: check pid has attached task in fdinfo Currently, when a task is dead we still print the pid it used to use in the fdinfo files of its pidfds. This doesn't make much sense since the pid may have already been reused. So verify that the task is still alive by introducing the pid_has_task() helper which will be used by other callers in follow-up patches. If the task is not alive anymore, we will print -1. This allows us to differentiate between a task not being present in a given pid namespace - in which case we already print 0 - and a task having been reaped. Note that this uses PIDTYPE_PID for the check. Technically, we could've checked PIDTYPE_TGID since pidfds currently only refer to thread-group leaders but if they won't anymore in the future then this check becomes problematic without it being immediately obvious to non-experts imho. 
If a thread is created via clone(CLONE_THREAD) than struct pid has a single non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even though the thread-group leader might still be very much alive. So checking PIDTYPE_PID is fine and is easier to maintain should we ever allow pidfds to refer to threads. Cc: Jann Horn Cc: Christian Kellner Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-1-christian.brauner@ubuntu.com --- kernel/fork.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 782986962d47..ffa314838b43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; - pid_t nr = pid_nr_ns(pid, ns); + struct pid_namespace *ns; + pid_t nr = -1; - seq_put_decimal_ull(m, "Pid:\t", nr); + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS - seq_put_decimal_ull(m, "\nNSpid:\t", nr); - if (nr) { + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that @@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); -- cgit v1.2.3 From 1d416a113f0c0da7a3608ffe4d44bd8e9c4859fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:30 +0200 Subject: pid: use pid_has_task() in __change_pid() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-3-christian.brauner@ubuntu.com --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 0a9f2e437217..124d40b239b1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -299,7 +299,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) + if (pid_has_task(pid, tmp)) return; free_pid(pid); -- cgit v1.2.3 From 1722c14a2097634a7ba37000c0ec7d9409918b64 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:31 +0200 Subject: exit: use pid_has_task() in do_wait() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. 
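The helper itself is not shown in this series' context, but the diffs replace open-coded hlist_empty(&pid->tasks[type]) checks with pid_has_task(), so it is equivalent to the following sketch (illustrative only; the real definition lives in include/linux/pid.h):

#include <linux/pid.h>

/* Sketch: does any task of @type still hang off @pid? */
static inline bool pid_has_task_example(struct pid *pid, enum pid_type type)
{
	return !hlist_empty(&pid->tasks[type]);
}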
Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-4-christian.brauner@ubuntu.com --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..f2d20ab74422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1457,7 +1457,7 @@ repeat: */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 1e1d0f0b1a3e3533ea4cd4021eb251e53827c70b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:32 +0200 Subject: pid: use pid_has_task() in pidfd_open() Use the new pid_has_task() helper in pidfd_open(). This simplifies the code and avoids taking rcu_read_{lock,unlock}() and leads to overall nicer code. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-5-christian.brauner@ubuntu.com --- kernel/pid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 124d40b239b1..7b5f6c963d72 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -497,7 +497,7 @@ static int pidfd_create(struct pid *pid) */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { - int fd, ret; + int fd; struct pid *p; if (flags) @@ -510,13 +510,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - ret = 0; - rcu_read_lock(); - if (!pid_task(p, PIDTYPE_TGID)) - ret = -EINVAL; - rcu_read_unlock(); + if (pid_has_task(p, PIDTYPE_TGID)) + fd = pidfd_create(p); + else + fd = -EINVAL; - fd = ret ?: pidfd_create(p); put_pid(p); return fd; } -- cgit v1.2.3 From 8580ac9404f6240668a026785d7d8856f0530409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:57 -0700 Subject: bpf: Process in-kernel BTF If in-kernel BTF exists parse it and prepare 'struct btf *btf_vmlinux' for further use by the verifier. In-kernel BTF is trusted just like kallsyms and other build artifacts embedded into vmlinux. Yet run this BTF image through BTF verifier to make sure that it is valid and it wasn't mangled during the build. If in-kernel BTF is incorrect it means either gcc or pahole or kernel are buggy. In such case disallow loading BPF programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-4-ast@kernel.org --- kernel/bpf/btf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 20 +++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 29c7c06c6bd6..ddeab1e8d21e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -698,6 +698,13 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + /* btf verifier prints all types it is processing via + * btf_verifier_log_type(..., fmt = NULL). + * Skip those prints for in-kernel BTF verification. 
+ */ + if (log->level == BPF_LOG_KERNEL && !fmt) + return; + __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, btf_kind_str[kind], @@ -735,6 +742,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL && !fmt) + return; /* The CHECK_META phase already did a btf dump. * * If member is logged again, it must hit an error in @@ -777,6 +786,8 @@ static void btf_verifier_log_vsi(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL && !fmt) + return; if (env->phase != CHECK_META) btf_verifier_log_type(env, datasec_type, NULL); @@ -802,6 +813,8 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env, if (!bpf_verifier_log_needed(log)) return; + if (log->level == BPF_LOG_KERNEL) + return; hdr = &btf->hdr; __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); __btf_verifier_log(log, "version: %u\n", hdr->version); @@ -2405,7 +2418,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - + if (env->log.level == BPF_LOG_KERNEL) + continue; btf_verifier_log(env, "\t%s val=%d\n", __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); @@ -3367,6 +3381,61 @@ errout: return ERR_PTR(err); } +extern char __weak _binary__btf_vmlinux_bin_start[]; +extern char __weak _binary__btf_vmlinux_bin_end[]; + +struct btf *btf_parse_vmlinux(void) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + int err; + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->data = _binary__btf_vmlinux_bin_start; + btf->data_size = _binary__btf_vmlinux_bin_end - + _binary__btf_vmlinux_bin_start; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d3446f018b9a..466b3b19de4d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -207,6 +207,8 @@ struct bpf_call_arg_meta { int func_id; }; +struct btf *btf_vmlinux; + static DEFINE_MUTEX(bpf_verifier_lock); static const struct bpf_line_info * @@ -243,6 +245,10 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, n = min(log->len_total - log->len_used - 1, n); log->kbuf[n] = '\0'; + if (log->level == BPF_LOG_KERNEL) { + pr_err("BPF:%s\n", log->kbuf); + return; + } if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else @@ -9294,6 +9300,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->ops = bpf_verifier_ops[env->prog->type]; is_priv = capable(CAP_SYS_ADMIN); + if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + mutex_lock(&bpf_verifier_lock); + if (!btf_vmlinux) + btf_vmlinux = btf_parse_vmlinux(); + mutex_unlock(&bpf_verifier_lock); + } + /* grab the mutex to protect few globals used by verifier */ 
if (!is_priv) mutex_lock(&bpf_verifier_lock); @@ -9313,6 +9326,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto err_unlock; } + if (IS_ERR(btf_vmlinux)) { + /* Either gcc or pahole or kernel are broken. */ + verbose(env, "in-kernel BTF is malformed\n"); + ret = PTR_ERR(btf_vmlinux); + goto err_unlock; + } + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit v1.2.3 From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In a later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btd_id > 0 raw_tp with BTF and additional type safety. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- kernel/bpf/syscall.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 82eabd4e38ad..b56c482c9760 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1565,8 +1566,9 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) } static int -bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, - enum bpf_attach_type expected_attach_type) +bpf_prog_load_check_attach(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type, + u32 btf_id) { switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: @@ -1608,13 +1610,19 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (btf_id > BTF_MAX_TYPE) + return -EINVAL; + return 0; default: + if (btf_id) + return -EINVAL; return 0; } } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt +#define BPF_PROG_LOAD_LAST_FIELD attach_btf_id static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1656,7 +1664,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -EPERM; bpf_prog_load_fixup_attach_type(attr); - if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + if (bpf_prog_load_check_attach(type, attr->expected_attach_type, + attr->attach_btf_id)) return -EINVAL; /* plain bpf_prog allocation */ @@ -1665,6 +1674,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return -ENOMEM; prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->offload_requested = !!attr->prog_ifindex; -- cgit v1.2.3 From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 
2019 20:25:00 -0700 Subject: bpf: Implement accurate raw_tp context access via BTF libbpf analyzes bpf C program, searches in-kernel BTF for given type name and stores it into expected_attach_type. The kernel verifier expects this btf_id to point to something like: typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc); which represents signature of raw_tracepoint "kfree_skb". Then btf_ctx_access() matches ctx+0 access in bpf program with 'skb' and 'ctx+8' access with 'loc' arguments of "kfree_skb" tracepoint. In first case it passes btf_id of 'struct sk_buff *' back to the verifier core and 'void *' in second case. Then the verifier tracks PTR_TO_BTF_ID as any other pointer type. Like PTR_TO_SOCKET points to 'struct bpf_sock', PTR_TO_TCP_SOCK points to 'struct bpf_tcp_sock', and so on. PTR_TO_BTF_ID points to in-kernel structs. If 1234 is btf_id of 'struct sk_buff' in vmlinux's BTF then PTR_TO_BTF_ID#1234 points to one of in kernel skbs. When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)r1 + 32) the btf_struct_access() checks which field of 'struct sk_buff' is at offset 32. Checks that size of access matches type definition of the field and continues to track the dereferenced type. If that field was a pointer to 'struct net_device' the r2's type will be PTR_TO_BTF_ID#456. Where 456 is btf_id of 'struct net_device' in vmlinux's BTF. Such verifier analysis prevents "cheating" in BPF C program. The program cannot cast arbitrary pointer to 'struct sk_buff *' and access it. C compiler would allow type cast, of course, but the verifier will notice type mismatch based on BPF assembly and in-kernel BTF. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org --- kernel/bpf/btf.c | 190 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 88 +++++++++++++++++++++- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 276 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ddeab1e8d21e..271d27cd427f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3436,6 +3436,196 @@ errout: return ERR_PTR(err); } +extern struct btf *btf_vmlinux; + +bool btf_ctx_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct bpf_verifier_log *log = info->log; + u32 btf_id = prog->aux->attach_btf_id; + const struct btf_param *args; + const struct btf_type *t; + const char prefix[] = "btf_trace_"; + const char *tname; + u32 nr_args, arg; + + if (!btf_id) + return true; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return false; + } + + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { + bpf_log(log, "btf_id is invalid\n"); + return false; + } + + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (strncmp(prefix, tname, sizeof(prefix) - 1)) { + bpf_log(log, "btf_id points to wrong type name %s\n", tname); + return false; + } + tname += sizeof(prefix) - 1; + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return false; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return false; + + if (off % 8) { + bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n", + tname, off); + return false; + } + arg = off / 8; + args = (const struct btf_param *)(t + 1); + /* skip 
first 'void *__data' argument in btf_trace_##name typedef */ + args++; + nr_args = btf_type_vlen(t) - 1; + if (arg >= nr_args) { + bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n", + tname, arg); + return false; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf_vmlinux, t->type); + if (btf_type_is_int(t)) + /* accessing a scalar */ + return true; + if (!btf_type_is_ptr(t)) { + bpf_log(log, + "raw_tp '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", + tname, arg, + __btf_name_by_offset(btf_vmlinux, t->name_off), + btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + if (t->type == 0) + /* This is a pointer to void. + * It is the same as scalar from the verifier safety pov. + * No further pointer walking is allowed. + */ + return true; + + /* this is a pointer to another type */ + info->reg_type = PTR_TO_BTF_ID; + info->btf_id = t->type; + + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_struct(t)) { + bpf_log(log, + "raw_tp '%s' arg%d type %s is not a struct\n", + tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); + return false; + } + bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n", + tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], + __btf_name_by_offset(btf_vmlinux, t->name_off)); + return true; +} + +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id) +{ + const struct btf_member *member; + const struct btf_type *mtype; + const char *tname, *mname; + int i, moff = 0, msize; + +again: + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!btf_type_is_struct(t)) { + bpf_log(log, "Type '%s' is not a struct", tname); + return -EINVAL; + } + + for_each_member(i, t, member) { + /* offset of the field in bits */ + moff = btf_member_bit_offset(t, member); + + if (btf_member_bitfield_size(t, member)) + /* bitfields are not supported yet */ + continue; + + if (off + size <= moff / 8) + /* won't find anything, field is already too far */ + break; + + /* type of the field */ + mtype = btf_type_by_id(btf_vmlinux, member->type); + mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + + /* skip modifiers */ + while (btf_type_is_modifier(mtype)) + mtype = btf_type_by_id(btf_vmlinux, mtype->type); + + if (btf_type_is_array(mtype)) + /* array deref is not supported yet */ + continue; + + if (!btf_type_has_size(mtype) && !btf_type_is_ptr(mtype)) { + bpf_log(log, "field %s doesn't have size\n", mname); + return -EFAULT; + } + if (btf_type_is_ptr(mtype)) + msize = 8; + else + msize = mtype->size; + if (off >= moff / 8 + msize) + /* no overlap with member, keep iterating */ + continue; + /* the 'off' we're looking for is either equal to start + * of this field or inside of this struct + */ + if (btf_type_is_struct(mtype)) { + /* our field must be inside that union or struct */ + t = mtype; + + /* adjust offset we're looking for */ + off -= moff / 8; + goto again; + } + if (msize != size) { + /* field access size doesn't match */ + bpf_log(log, + "cannot access %d bytes in struct %s field %s that has size %d\n", + size, tname, mname, msize); + return -EACCES; + } + + if (btf_type_is_ptr(mtype)) { + const struct btf_type *stype; + + stype = btf_type_by_id(btf_vmlinux, mtype->type); + /* skip modifiers */ + while 
(btf_type_is_modifier(stype)) + stype = btf_type_by_id(btf_vmlinux, stype->type); + if (btf_type_is_struct(stype)) { + *next_btf_id = mtype->type; + return PTR_TO_BTF_ID; + } + } + /* all other fields are treated as scalars */ + return SCALAR_VALUE; + } + bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); + return -EINVAL; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 466b3b19de4d..42a463e09761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -286,6 +286,19 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + static const char *ltrim(const char *s) { while (isspace(*s)) @@ -406,6 +419,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", }; static char slot_type_char[] = { @@ -436,6 +450,12 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } +const char *kernel_type_name(u32 id) +{ + return btf_name_by_offset(btf_vmlinux, + btf_type_by_id(btf_vmlinux, id)->name_off); +} + static void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) { @@ -460,6 +480,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { + if (t == PTR_TO_BTF_ID) + verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -2337,10 +2359,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type) + enum bpf_access_type t, enum bpf_reg_type *reg_type, + u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, + .log = &env->log, }; if (env->ops->is_valid_access && @@ -2354,7 +2378,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + if (*reg_type == PTR_TO_BTF_ID) + *btf_id = info.btf_id; + else + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -2780,6 +2807,53 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) return 0; } +static int check_ptr_to_btf_access(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, + int regno, int off, int size, + enum bpf_access_type atype, + int value_regno) +{ + struct bpf_reg_state *reg = regs + regno; + const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); + const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + u32 btf_id; + int ret; + + if (atype != BPF_READ) { + verbose(env, "only read is supported\n"); + return -EACCES; + } + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + regno, tname, off, tn_buf); + return -EACCES; + } + + ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + if (ret < 0) + return ret; + + if (ret == SCALAR_VALUE) { + mark_reg_unknown(env, regs, value_regno); + return 0; + } + mark_reg_known_zero(env, regs, value_regno); + regs[value_regno].type = PTR_TO_BTF_ID; + regs[value_regno].btf_id = btf_id; + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -2840,6 +2914,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -2851,7 +2926,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + if (err) + verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -2870,6 +2947,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. 
*/ regs[value_regno].subreg_def = DEF_NOT_SUBREG; + if (reg_type == PTR_TO_BTF_ID) + regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; } @@ -2929,6 +3008,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_BTF_ID) { + err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..6221e8c6ecc3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1074,7 +1074,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size, return false; if (off % size != 0) return false; - return true; + return btf_ctx_access(off, size, type, prog, info); } const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { -- cgit v1.2.3 From ac4414b5ca47d16c8de3134cc1b868056c4a68ea Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:01 -0700 Subject: bpf: Attach raw_tp program with BTF via type name BTF type id specified at program load time has all necessary information to attach that program to raw tracepoint. Use kernel type name to find raw tracepoint. Add missing CHECK_ATTR() condition. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-8-ast@kernel.org --- kernel/bpf/syscall.c | 70 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b56c482c9760..523e3ac15a08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1816,17 +1816,52 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) struct bpf_raw_tracepoint *raw_tp; struct bpf_raw_event_map *btp; struct bpf_prog *prog; - char tp_name[128]; + const char *tp_name; + char buf[128]; int tp_fd, err; - if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(tp_name) - 1) < 0) - return -EFAULT; - tp_name[sizeof(tp_name) - 1] = 0; + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } + + if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->aux->attach_btf_id) { + if (attr->raw_tracepoint.name) { + /* raw_tp name should not be specified in raw_tp + * programs that were verified via in-kernel BTF info + */ + err = -EINVAL; + goto out_put_prog; + } + /* raw_tp name is taken from type name instead */ + tp_name = kernel_type_name(prog->aux->attach_btf_id); + /* skip the prefix */ + tp_name += sizeof("btf_trace_") - 1; + } else { + if (strncpy_from_user(buf, + u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(buf) - 1) < 0) { + err = -EFAULT; + goto out_put_prog; + } + buf[sizeof(buf) - 1] = 0; + tp_name = buf; + } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) - return -ENOENT; + if (!btp) { + err = -ENOENT; + goto out_put_prog; + } raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); if (!raw_tp) { @@ -1834,38 +1869,27 @@ static int 
bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_btp; } raw_tp->btp = btp; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto out_free_tp; - } - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } + raw_tp->prog = prog; err = bpf_probe_register(raw_tp->btp, prog); if (err) - goto out_put_prog; + goto out_free_tp; - raw_tp->prog = prog; tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, O_CLOEXEC); if (tp_fd < 0) { bpf_probe_unregister(raw_tp->btp, prog); err = tp_fd; - goto out_put_prog; + goto out_free_tp; } return tp_fd; -out_put_prog: - bpf_prog_put(prog); out_free_tp: kfree(raw_tp); out_put_btp: bpf_put_raw_tracepoint(btp); +out_put_prog: + bpf_prog_put(prog); return err; } -- cgit v1.2.3 From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:02 -0700 Subject: bpf: Add support for BTF pointers to interpreter Pointer to BTF object is a pointer to kernel object or NULL. The memory access in the interpreter has to be done via probe_kernel_read to avoid page faults. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org --- kernel/bpf/core.c | 19 +++++++++++++++++++ kernel/bpf/verifier.c | 8 ++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..8a765bbd33f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1291,6 +1291,11 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON +u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +{ + memset(dst, 0, size); + return -EFAULT; +} /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1310,6 +1315,10 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6 /* Non-UAPI available opcodes. 
*/ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, + [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, + [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, + [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1542,6 +1551,16 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ + CONT; + LDX_PROBE(B, 1) + LDX_PROBE(H, 2) + LDX_PROBE(W, 4) + LDX_PROBE(DW, 8) +#undef LDX_PROBE + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42a463e09761..c4b6a2cfcd47 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7581,6 +7581,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: + case PTR_TO_BTF_ID: return false; default: return true; @@ -8722,6 +8723,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_XDP_SOCK: convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; + case PTR_TO_BTF_ID: + if (type == BPF_WRITE) { + verbose(env, "Writes through BTF pointers are not allowed\n"); + return -EINVAL; + } + insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); + continue; default: continue; } -- cgit v1.2.3 From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:03 -0700 Subject: bpf: Add support for BTF pointers to x86 JIT Pointer to BTF object is a pointer to kernel object or NULL. Such pointers can only be used by BPF_LDX instructions. The verifier changed their opcode from LDX|MEM|size to LDX|PROBE_MEM|size to make JITing easier. The number of entries in extable is the number of BPF_LDX insns that access kernel memory via "pointer to BTF type". Only these load instructions can fault. Since x86 extable is relative it has to be allocated in the same memory region as JITed code. Allocate it prior to last pass of JITing and let the last pass populate it. Pointer to extable in bpf_prog_aux is necessary to make page fault handling fast. Page fault handling is done in two steps: 1. bpf_prog_kallsyms_find() finds BPF program that page faulted. It's done by walking rb tree. 2. then extable for given bpf program is binary searched. This process is similar to how page faulting is done for kernel modules. The exception handler skips over faulting x86 instruction and initializes destination register with zero. This mimics exact behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed). JITs for other architectures can add support in similar way. Until then they will reject unknown opcode and fallback to interpreter. Since extable should be aligned and placed near JITed code make bpf_jit_binary_alloc() return 4 byte aligned image offset, so that extable aligning formula in bpf_int_jit_compile() doesn't need to rely on internal implementation of bpf_jit_binary_alloc(). On x86 gcc defaults to 16-byte alignment for regular kernel functions due to better performance. JITed code may be aligned to 16 in the future, but it will use 4 in the meantime. 
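The arch half of this change lives outside the 'kernel'-only scope of this log, but the fixup semantics described above (zero the destination register and resume after the faulting load, mimicking bpf_probe_read) reduce to a handler along the lines of the sketch below. The fixup encoding and the handler prototype shown here are assumptions for illustration, not the literal x86 implementation.

static bool ex_handler_bpf(const struct exception_table_entry *e,
			   struct pt_regs *regs, int trapnr,
			   unsigned long error_code, unsigned long fault_addr)
{
	/* assumed encoding: low byte holds the length of the faulting
	 * load, the remaining bits the pt_regs offset of the destination
	 * register chosen by the JIT
	 */
	u32 reg_off = e->fixup >> 8;

	*(unsigned long *)((void *)regs + reg_off) = 0;	/* dest := 0 */
	regs->ip += e->fixup & 0xff;			/* skip the faulting load */
	return true;
}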
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org --- kernel/bpf/core.c | 20 +++++++++++++++++++- kernel/bpf/verifier.c | 1 + kernel/extable.c | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a765bbd33f0..673f5d40a93e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -30,7 +30,7 @@ #include #include #include - +#include #include /* Registers */ @@ -712,6 +712,24 @@ bool is_bpf_text_address(unsigned long addr) return ret; } +const struct exception_table_entry *search_bpf_extables(unsigned long addr) +{ + const struct exception_table_entry *e = NULL; + struct bpf_prog *prog; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (!prog) + goto out; + if (!prog->aux->num_exentries) + goto out; + + e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); +out: + rcu_read_unlock(); + return e; +} + int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4b6a2cfcd47..fba9ef6a831b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8729,6 +8729,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -EINVAL; } insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; continue; default: continue; diff --git a/kernel/extable.c b/kernel/extable.c index f6c9406eec7d..f6920a11e28a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -56,6 +56,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); + if (!e) + e = search_bpf_extables(addr); return e; } -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. 
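To make the flow above concrete, a sketch of the BPF side follows: the program receives a verified 'struct sk_buff *' from the kfree_skb tracepoint and hands it to bpf_skb_output(), whose first argument the verifier now checks against in-kernel BTF. The section name, the hand-rolled helper declaration and the map definition style follow the selftests of this era and are assumptions, not part of this patch.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct sk_buff;

/* helper id added by this series; declared by hand for the sketch */
static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags,
			      void *data, __u64 size) =
	(void *)BPF_FUNC_skb_output;

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} perf_buf_map SEC(".maps");

/* mirrors btf_trace_kfree_skb minus the leading 'void *__data' */
struct kfree_skb_args {
	struct sk_buff *skb;
	void *location;
};

SEC("tp_btf/kfree_skb")
int trace_kfree_skb(struct kfree_skb_args *ctx)
{
	struct sk_buff *skb = ctx->skb;	/* PTR_TO_BTF_ID to the verifier */
	void *location = ctx->location;	/* copy to stack for the helper */

	bpf_skb_output(skb, &perf_buf_map, BPF_F_CURRENT_CPU,
		       &location, sizeof(location));
	return 0;
}

char _license[] SEC("license") = "GPL";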
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- kernel/bpf/btf.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 44 ++++++++++++++++++++----------- kernel/trace/bpf_trace.c | 4 +++ 3 files changed, 101 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 271d27cd427f..f7557af39756 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3626,6 +3626,74 @@ again: return -EINVAL; } +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +{ + char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; + const struct btf_param *args; + const struct btf_type *t; + const char *tname, *sym; + u32 btf_id, i; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); + if (!sym) { + bpf_log(log, "kernel doesn't have kallsyms\n"); + return -EFAULT; + } + + for (i = 1; i <= btf_vmlinux->nr_types; i++) { + t = btf_type_by_id(btf_vmlinux, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) + continue; + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!strcmp(tname, fnname)) + break; + } + if (i > btf_vmlinux->nr_types) { + bpf_log(log, "helper %s type is not found\n", fnname); + return -ENOENT; + } + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return -EFAULT; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EFAULT; + + args = (const struct btf_param *)(t + 1); + if (arg >= btf_type_vlen(t)) { + bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", + fnname, arg); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!btf_type_is_ptr(t) || !t->type) { + /* anything but the pointer to struct is a helper config bug */ + bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); + return -EFAULT; + } + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) { + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + } + if (!btf_type_is_struct(t)) { + bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); + return -EFAULT; + } + bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, + arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); + return btf_id; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fba9ef6a831b..556e82f8869b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ struct bpf_call_arg_meta { u64 msize_umax_value; int ref_obj_id; int func_id; + u32 btf_id; }; struct btf *btf_vmlinux; @@ -3439,6 +3440,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + } else if (arg_type == ARG_PTR_TO_BTF_ID) { + expected_type = PTR_TO_BTF_ID; + if (type != expected_type) + goto err_type; + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return 
-EACCES; + } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -3586,6 +3603,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_skb_output && func_id != BPF_FUNC_perf_event_read_value) goto error; break; @@ -3673,6 +3691,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: + case BPF_FUNC_skb_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -4127,21 +4146,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); - if (err) - return err; + for (i = 0; i < 5; i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { + if (!fn->btf_id[i]) + fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); + meta.btf_id = fn->btf_id[i]; + } + err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (err) + return err; + } err = record_func_map(env, &meta, func_id, insn_idx); if (err) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6221e8c6ecc3..52f7e9d8c29b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -995,6 +995,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +extern const struct bpf_func_proto bpf_skb_output_proto; + BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { @@ -1053,6 +1055,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: -- cgit v1.2.3 From fc65104c7c89635ecacbff6cf5724ac24232d381 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 18 Oct 2019 11:18:42 +0800 Subject: dma-debug: Use pr_warn instead of pr_warning As said in commit f2c2cbcc35d4 ("powerpc: Use pr_warn instead of pr_warning"), removing pr_warning so all logging messages use a consistent _warn style. Let's do it. 
Link: http://lkml.kernel.org/r/20191018031850.48498-25-wangkefeng.wang@huawei.com To: linux-kernel@vger.kernel.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/dma/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..a26170469543 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -161,7 +161,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) { #ifdef CONFIG_STACKTRACE if (entry) { - pr_warning("Mapped at:\n"); + pr_warn("Mapped at:\n"); stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif -- cgit v1.2.3 From 3da2e1fd46a726531a1e6d621bda62821a5e559a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 18 Oct 2019 11:18:43 +0800 Subject: trace: Use pr_warn instead of pr_warning As said in commit f2c2cbcc35d4 ("powerpc: Use pr_warn instead of pr_warning"), removing pr_warning so all logging messages use a consistent _warn style. Let's do it. Link: http://lkml.kernel.org/r/20191018031850.48498-26-wangkefeng.wang@huawei.com To: linux-kernel@vger.kernel.org Cc: Ingo Molnar Signed-off-by: Kefeng Wang Acked-by: Steven Rostedt (VMware) Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/trace/trace_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 80e0b2aca703..2e9a4746ea85 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -178,14 +178,14 @@ static int benchmark_event_kthread(void *arg) int trace_benchmark_reg(void) { if (!ok_to_run) { - pr_warning("trace benchmark cannot be started via kernel command line\n"); + pr_warn("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); if (IS_ERR(bm_event_thread)) { - pr_warning("trace benchmark failed to create kernel thread\n"); + pr_warn("trace benchmark failed to create kernel thread\n"); return PTR_ERR(bm_event_thread); } -- cgit v1.2.3 From c108e3c1bdbd0783d7c19ee80abb0591f79029e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Oct 2019 23:09:33 -0700 Subject: bpf: Fix bpf_attr.attach_btf_id check Only raw_tracepoint program type can have bpf_attr.attach_btf_id >= 0. Make sure to reject other program types that accidentally set it to non-zero. 
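For reference, the user-visible field being policed by this fix is bpf_attr.attach_btf_id at BPF_PROG_LOAD time. A minimal loader sketch is below; obtaining the id of the btf_trace_<name> typedef from vmlinux BTF is assumed to happen elsewhere, and error handling is elided.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int load_raw_tp_btf_prog(const struct bpf_insn *insns, __u32 insn_cnt,
				__u32 attach_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	attr.attach_btf_id = attach_btf_id;	/* id of the btf_trace_<tp> typedef */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

Attaching such a program then goes through BPF_RAW_TRACEPOINT_OPEN with raw_tracepoint.name left at zero, since bpf_raw_tracepoint_open() above derives the tracepoint name from the program's BTF type.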
Fixes: ccfe29eb29c2 ("bpf: Add attach_btf_id attribute to program load") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191018060933.2950231-1-ast@kernel.org --- kernel/bpf/syscall.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 523e3ac15a08..16ea3c0db4f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1570,6 +1570,17 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, u32 btf_id) { + switch (prog_type) { + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (btf_id > BTF_MAX_TYPE) + return -EINVAL; + break; + default: + if (btf_id) + return -EINVAL; + break; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { @@ -1610,13 +1621,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } - case BPF_PROG_TYPE_RAW_TRACEPOINT: - if (btf_id > BTF_MAX_TYPE) - return -EINVAL; - return 0; default: - if (btf_id) - return -EINVAL; return 0; } } -- cgit v1.2.3 From 1f5343c0ae9673543055e9794362766e1f0ed163 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 18 Oct 2019 17:03:44 +0800 Subject: bpf: Fix build error without CONFIG_NET If CONFIG_NET is n, building fails: kernel/trace/bpf_trace.o: In function `raw_tp_prog_func_proto': bpf_trace.c:(.text+0x1a34): undefined reference to `bpf_skb_output_proto' Wrap it into a #ifdef to fix this. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20191018090344.26936-1-yuehaibing@huawei.com --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 52f7e9d8c29b..c3240898cc44 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1055,8 +1055,10 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; +#ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; +#endif case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: -- cgit v1.2.3 From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other thread's signal handler. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. 
Adding this flags lets glibc avoid these problems. [1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- kernel/fork.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ffa314838b43..954e875e72b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); + + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ + if (clone_flags & CLONE_CLEAR_SIGHAND) + flush_signal_handlers(tsk, 0); + return 0; } @@ -2619,11 +2624,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, static bool clone3_args_valid(const struct kernel_clone_args *kargs) { - /* - * All lower bits of the flag word are taken. - * Verify that no other unknown flags are passed along. - */ - if (kargs->flags & ~CLONE_LEGACY_FLAGS) + /* Verify that no unknown flags are passed along. */ + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) return false; /* @@ -2633,6 +2635,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) + return false; + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; -- cgit v1.2.3 From a713af394cf382a30dd28a1015cbe572f1b9ca75 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Oct 2019 02:50:01 +1100 Subject: cgroup: pids: use atomic64_t for pids->limit Because pids->limit can be changed concurrently (but we don't want to take a lock because it would be needlessly expensive), use atomic64_ts instead. Fixes: commit 49b786ea146f ("cgroup: implement the PIDs subsystem") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Aleksa Sarai Signed-off-by: Tejun Heo --- kernel/cgroup/pids.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8e513a573fe9..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -45,7 +45,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). 
*/ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -277,7 +278,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); -- cgit v1.2.3 From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Oct 2019 17:18:11 -0700 Subject: bpf: Prepare btf_ctx_access for non raw_tp use case This patch makes a few changes to btf_ctx_access() to prepare it for non raw_tp use case where the attach_btf_id is not necessary a BTF_KIND_TYPEDEF. It moves the "btf_trace_" prefix check and typedef-follow logic to a new function "check_attach_btf_id()" which is called only once during bpf_check(). btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO type now. That should also be more efficient since it is done only one instead of every-time check_ctx_access() is called. "check_attach_btf_id()" needs to find the func_proto type from the attach_btf_id. It needs to store the result into the newly added prog->aux->attach_func_proto. func_proto btf type has no name, so a proper name should be stored into "attach_func_name" also. 
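Spelled out in C, the chain that check_attach_btf_id() resolves once at load time is the one already quoted for kfree_skb earlier in this series: a typedef carrying the btf_trace_ prefix that points at a function prototype whose first argument is the tracepoint's 'void *__data'. attach_func_proto ends up referring to that func_proto and attach_func_name to the name with the prefix stripped.

/* illustrative C equivalent of the BTF objects being walked */
typedef void (*btf_trace_kfree_skb)(void *__data,
				    struct sk_buff *skb,
				    void *location);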
v2: - Move the "btf_trace_" check to an earlier verifier phase (Alexei) Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com --- kernel/bpf/btf.c | 73 +++++++++--------------------------------------- kernel/bpf/syscall.c | 4 +-- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++- kernel/trace/bpf_trace.c | 2 ++ 4 files changed, 67 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7557af39756..128d89601d73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -336,16 +336,6 @@ static bool btf_type_is_fwd(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; } -static bool btf_type_is_func(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -} - -static bool btf_type_is_func_proto(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; -} - static bool btf_type_nosize(const struct btf_type *t) { return btf_type_is_void(t) || btf_type_is_fwd(t) || @@ -377,16 +367,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_ptr(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; -} - -static bool btf_type_is_int(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_INT; -} - static bool btf_type_is_var(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; @@ -3442,54 +3422,27 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + const struct btf_type *t = prog->aux->attach_func_proto; + const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; - u32 btf_id = prog->aux->attach_btf_id; const struct btf_param *args; - const struct btf_type *t; - const char prefix[] = "btf_trace_"; - const char *tname; u32 nr_args, arg; - if (!btf_id) - return true; - - if (IS_ERR(btf_vmlinux)) { - bpf_log(log, "btf_vmlinux is malformed\n"); - return false; - } - - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { - bpf_log(log, "btf_id is invalid\n"); - return false; - } - - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); - if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - bpf_log(log, "btf_id points to wrong type name %s\n", tname); - return false; - } - tname += sizeof(prefix) - 1; - - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_ptr(t)) - return false; - t = btf_type_by_id(btf_vmlinux, t->type); - if (!btf_type_is_func_proto(t)) - return false; - if (off % 8) { - bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n", + bpf_log(log, "func '%s' offset %d is not multiple of 8\n", tname, off); return false; } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* skip first 'void *__data' argument in btf_trace_##name typedef */ - args++; - nr_args = btf_type_vlen(t) - 1; + nr_args = btf_type_vlen(t); + if (prog->aux->attach_btf_trace) { + /* skip first 'void *__data' argument in btf_trace_##name typedef */ + args++; + nr_args--; + } if (arg >= nr_args) { - bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n", + bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg); return false; } @@ -3503,7 +3456,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; if (!btf_type_is_ptr(t)) { bpf_log(log, - "raw_tp 
'%s' arg%d '%s' has type %s. Only pointer access is allowed\n", + "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf_vmlinux, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3526,11 +3479,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, t = btf_type_by_id(btf_vmlinux, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, - "raw_tp '%s' arg%d type %s is not a struct\n", + "func '%s' arg%d type %s is not a struct\n", tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } - bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n", + bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], __btf_name_by_offset(btf_vmlinux, t->name_off)); return true; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 16ea3c0db4f6..ff5225759553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1848,9 +1848,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } /* raw_tp name is taken from type name instead */ - tp_name = kernel_type_name(prog->aux->attach_btf_id); - /* skip the prefix */ - tp_name += sizeof("btf_trace_") - 1; + tp_name = prog->aux->attach_func_name; } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 556e82f8869b..c59778c0fc4d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9372,6 +9372,52 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + u32 btf_id = prog->aux->attach_btf_id; + const struct btf_type *t; + const char *tname; + + if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { + const char prefix[] = "btf_trace_"; + + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + if (!btf_type_is_typedef(t)) { + verbose(env, "attach_btf_id %u is not a typedef\n", + btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) { + verbose(env, "attach_btf_id %u points to wrong type name %s\n", + btf_id, tname); + return -EINVAL; + } + tname += sizeof(prefix) - 1; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + + /* remember two read only pointers that are valid for + * the life time of the kernel + */ + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + prog->aux->attach_btf_trace = true; + } + return 0; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -9435,9 +9481,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, /* Either gcc or pahole or kernel are broken. 
*/ verbose(env, "in-kernel BTF is malformed\n"); ret = PTR_ERR(btf_vmlinux); - goto err_unlock; + goto skip_full_check; } + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c3240898cc44..571c25d60710 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1080,6 +1080,8 @@ static bool raw_tp_prog_is_valid_access(int off, int size, return false; if (off % size != 0) return false; + if (!prog->aux->attach_btf_id) + return true; return btf_ctx_access(off, size, type, prog, info); } -- cgit v1.2.3 From 5153faac18d293fc7abb19ff7034683fbcd82dc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2019 12:03:51 -0700 Subject: cgroup: remove cgroup_enable_task_cg_lists() optimization cgroup_enable_task_cg_lists() is used to lazyily initialize task cgroup associations on the first use to reduce fork / exit overheads on systems which don't use cgroup. Unfortunately, locking around it has never been actually correct and its value is dubious given how the vast majority of systems use cgroup right away from boot. This patch removes the optimization. For now, replace the cg_list based branches with WARN_ON_ONCE()'s to be on the safe side. We can simplify the logic further in the future. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 184 +++++++++++-------------------------------------- kernel/cgroup/cpuset.c | 2 - 2 files changed, 39 insertions(+), 147 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8b1c4fd47a7a..cf32c0c7a45d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1883,65 +1883,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. 
- */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - cset->nr_tasks++; - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); -out_unlock: - spin_unlock_irq(&css_set_lock); - read_unlock(&tasklist_lock); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2187,13 +2128,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; @@ -2371,9 +2305,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (task->flags & PF_EXITING) return; - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; + /* cgroup_threadgroup_rwsem protects racing against forks */ + WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) @@ -4586,9 +4519,6 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { - /* no one should try to iterate before mounting cgroups */ - WARN_ON_ONCE(!use_task_css_set_links); - memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); @@ -6022,62 +5952,38 @@ void cgroup_cancel_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; + struct css_set *cset; int i; + spin_lock_irq(&css_set_lock); + + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. + * If the cgroup has to be frozen, the new task has too. Let's set + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the + * frozen state. 
*/ - if (use_task_css_set_links) { - struct css_set *cset; - - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); /* current is @child's parent */ - if (list_empty(&child->cg_list)) { - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); - } + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient switch + * from the frozen state and back. */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. - */ - } - - spin_unlock_irq(&css_set_lock); } + spin_unlock_irq(&css_set_lock); + /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() @@ -6101,29 +6007,19 @@ void cgroup_exit(struct task_struct *tsk) struct css_set *cset; int i; - /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us (thanks to cgroup_threadgroup_rwsem), we can check css_set - * and cg_list without synchronization. - */ - cset = task_css_set(tsk); + spin_lock_irq(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; + WARN_ON_ONCE(list_empty(&tsk->cg_list)); + cset = task_css_set(tsk); + css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; - WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - } else { - /* Take reference to avoid freeing init_css_set in cgroup_free, - * see cgroup_fork(). 
*/ - get_css_set(cset); - } + spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { @@ -6140,12 +6036,10 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c52bc91f882b..faff8f99e8f2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -928,8 +928,6 @@ static void rebuild_root_domains(void) lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); - cgroup_enable_task_cg_lists(); - rcu_read_lock(); /* -- cgit v1.2.3 From 771b53d033e8663abdf59704806aa856b236dcdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2019 10:25:58 -0600 Subject: io-wq: small threadpool implementation for io_uring This adds support for io-wq, a smaller and specialized thread pool implementation. This is meant to replace workqueues for io_uring. Among the reasons for this addition are: - We can assign memory context smarter and more persistently if we manage the life time of threads. - We can drop various work-arounds we have in io_uring, like the async_list. - We can implement hashed work insertion, to manage concurrency of buffered writes without needing a) an extra workqueue, or b) needlessly making the concurrency of said workqueue very low which hurts performance of multiple buffered file writers. - We can implement cancel through signals, for cancelling interruptible work like read/write (or send/recv) to/from sockets. - We need the above cancel for being able to assign and use file tables from a process. - We can implement a more thorough cancel operation in general. - We need it to move towards a syslet/threadlet model for even faster async execution. For that we need to take ownership of the used threads. This list is just off the top of my head. Performance should be the same, or better, at least that's what I've seen in my testing. io-wq supports basic NUMA functionality, setting up a pool per node. io-wq hooks up to the scheduler schedule in/out just like workqueue and uses that to drive the need for more/less workers. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- kernel/sched/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd05a378631a..a95a2f05f3b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -4103,9 +4104,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. 
*/ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } @@ -4122,8 +4126,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) -- cgit v1.2.3 From 15ab09bdca616538597fe4d2eb4db3c2d28716ba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 28 Oct 2019 20:24:26 -0700 Subject: bpf: Enforce 'return 0' in BTF-enabled raw_tp programs The return value of raw_tp programs is ignored by __bpf_trace_run() that calls them. The verifier also allows any value to be returned. For BTF-enabled raw_tp lets enforce 'return 0', so that return value can be used for something in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191029032426.1206762-1-ast@kernel.org --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c59778c0fc4d..6b0de04f8b91 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6279,6 +6279,11 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (!env->prog->aux->attach_btf_id) + return 0; + range = tnum_const(0); + break; default: return 0; } -- cgit v1.2.3 From af91acbc62999d62e2ca1e80f660d20561ca55d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 16:30:19 -0700 Subject: bpf: Fix bpf jit kallsym access Jiri reported crash when JIT is on, but net.core.bpf_jit_kallsyms is off. bpf_prog_kallsyms_find() was skipping addr->bpf_prog resolution logic in oops and stack traces. That's incorrect. It should only skip addr->name resolution for 'cat /proc/kallsyms'. That's what bpf_jit_kallsyms and bpf_jit_harden protect. Fixes: 3dec541b2e63 ("bpf: Add support for BTF pointers to x86 JIT") Reported-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191030233019.1187404-1-ast@kernel.org --- kernel/bpf/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 673f5d40a93e..8d3fbc86ca5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -668,9 +668,6 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) { struct latch_tree_node *n; - if (!bpf_jit_kallsyms_enabled()) - return NULL; - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : -- cgit v1.2.3 From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. 
But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_RPOG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- kernel/bpf/syscall.c | 6 +++--- kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++---------- kernel/trace/bpf_trace.c | 44 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ff5225759553..985d01ced196 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1571,7 +1571,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, u32 btf_id) { switch (prog_type) { - case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_TRACING: if (btf_id > BTF_MAX_TYPE) return -EINVAL; break; @@ -1833,13 +1833,13 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return PTR_ERR(prog); if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { err = -EINVAL; goto out_put_prog; } - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->aux->attach_btf_id) { + if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { /* raw_tp name should not be specified in raw_tp * programs that were verified via in-kernel BTF info diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b0de04f8b91..2f2374967b36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9381,24 +9381,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; + const char prefix[] = "btf_trace_"; const struct btf_type *t; const char *tname; - if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) { - const char prefix[] = "btf_trace_"; + if (prog->type != BPF_PROG_TYPE_TRACING) + return 0; - t = btf_type_by_id(btf_vmlinux, btf_id); - if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); - return -EINVAL; - } + if (!btf_id) { + verbose(env, "Tracing programs must provide btf_id\n"); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, btf_id); + if (!t) { + verbose(env, "attach_btf_id %u is invalid\n", btf_id); + return -EINVAL; + } + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + if (!tname) { + verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + return -EINVAL; + } + + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!tname || 
strncmp(prefix, tname, sizeof(prefix) - 1)) { + if (strncmp(prefix, tname, sizeof(prefix) - 1)) { verbose(env, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; @@ -9419,8 +9431,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_name = tname; prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; + return 0; + default: + return -EINVAL; } - return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 571c25d60710..f50bf19f7a05 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1055,10 +1055,6 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; -#ifdef CONFIG_NET - case BPF_FUNC_skb_output: - return &bpf_skb_output_proto; -#endif case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: @@ -1068,20 +1064,44 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; +#endif + default: + return raw_tp_prog_func_proto(func_id, prog); + } +} + static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - /* largest tracepoint in the kernel has 12 args */ - if (off < 0 || off >= sizeof(__u64) * 12) + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static bool tracing_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (!prog->aux->attach_btf_id) - return true; return btf_ctx_access(off, size, type, prog, info); } @@ -1093,6 +1113,14 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +const struct bpf_verifier_ops tracing_verifier_ops = { + .get_func_proto = tracing_prog_func_proto, + .is_valid_access = tracing_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracing_prog_ops = { +}; + static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From 8b5369ea580964dbc982781bfb9fb93459fc5e8d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 14 Oct 2019 20:31:03 +0200 Subject: dma/direct: turn ARCH_ZONE_DMA_BITS into a variable Some architectures, notably ARM, are interested in tweaking this depending on their runtime DMA addressing limitations. 
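For illustration, an architecture with a larger ZONE_DMA could override the new variable from its early boot code along these lines. This is only a sketch: the hook name arch_zone_sizes_init() is made up, the 30-bit limit is an arbitrary example, and zone_dma_bits is assumed to be declared in <linux/dma-direct.h> as part of this series.

    #include <linux/dma-direct.h>

    static void __init arch_zone_sizes_init(void)
    {
    	/* e.g. DMA masters that can only address the first 1 GiB */
    	if (IS_ENABLED(CONFIG_ZONE_DMA))
    		zone_dma_bits = 30;
    }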
Acked-by: Christoph Hellwig Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Catalin Marinas --- kernel/dma/direct.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..0b67c04e531b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,12 +16,11 @@ #include /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but - * some use it for entirely different regions: + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * it for entirely different regions. In that case the arch code needs to + * override the variable below for dma-direct to work properly. */ -#ifndef ARCH_ZONE_DMA_BITS -#define ARCH_ZONE_DMA_BITS 24 -#endif +unsigned int zone_dma_bits __ro_after_init = 24; static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { @@ -69,7 +68,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_mask <= DMA_BIT_MASK(32)) return GFP_DMA32; @@ -395,7 +394,7 @@ int dma_direct_supported(struct device *dev, u64 mask) u64 min_mask; if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + min_mask = DMA_BIT_MASK(zone_dma_bits); else min_mask = DMA_BIT_MASK(32); -- cgit v1.2.3 From 7e35e4eb7e56233dcf445992d7b835a9ba93408e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:09 +0100 Subject: livepatch: Keep replaced patches until post_patch callback is called Pre/post (un)patch callbacks might manipulate the system state. Cumulative livepatches might need to take over the changes made by the replaced ones. For this they might need to access some data stored or referenced by the old livepatches. Therefore the replaced livepatches have to stay around until post_patch() callback is called. It is achieved by calling the free functions later. It is the same location where disabled livepatches have already been freed. Link: http://lkml.kernel.org/r/20191030154313.13263-2-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 36 ++++++++++++++++++++++++++---------- kernel/livepatch/core.h | 5 +++-- kernel/livepatch/transition.c | 12 ++++++------ 3 files changed, 35 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ab4a4606d19b..1e1d87ead55c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -632,7 +632,7 @@ static void klp_free_objects_dynamic(struct klp_patch *patch) * The operation must be completed by calling klp_free_patch_finish() * outside klp_mutex. 
*/ -void klp_free_patch_start(struct klp_patch *patch) +static void klp_free_patch_start(struct klp_patch *patch) { if (!list_empty(&patch->list)) list_del(&patch->list); @@ -677,6 +677,23 @@ static void klp_free_patch_work_fn(struct work_struct *work) klp_free_patch_finish(patch); } +void klp_free_patch_async(struct klp_patch *patch) +{ + klp_free_patch_start(patch); + schedule_work(&patch->free_work); +} + +void klp_free_replaced_patches_async(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + klp_free_patch_async(old_patch); + } +} + static int klp_init_func(struct klp_object *obj, struct klp_func *func) { if (!func->old_name) @@ -1022,12 +1039,13 @@ err: EXPORT_SYMBOL_GPL(klp_enable_patch); /* - * This function removes replaced patches. + * This function unpatches objects from the replaced livepatches. * * We could be pretty aggressive here. It is called in the situation where - * these structures are no longer accessible. All functions are redirected - * by the klp_transition_patch. They use either a new code or they are in - * the original code because of the special nop function patches. + * these structures are no longer accessed from the ftrace handler. + * All functions are redirected by the klp_transition_patch. They + * use either a new code or they are in the original code because + * of the special nop function patches. * * The only exception is when the transition was forced. In this case, * klp_ftrace_handler() might still see the replaced patch on the stack. @@ -1035,18 +1053,16 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * thanks to RCU. We only have to keep the patches on the system. Also * this is handled transparently by patch->module_put. */ -void klp_discard_replaced_patches(struct klp_patch *new_patch) +void klp_unpatch_replaced_patches(struct klp_patch *new_patch) { - struct klp_patch *old_patch, *tmp_patch; + struct klp_patch *old_patch; - klp_for_each_patch_safe(old_patch, tmp_patch) { + klp_for_each_patch(old_patch) { if (old_patch == new_patch) return; old_patch->enabled = false; klp_unpatch_objects(old_patch); - klp_free_patch_start(old_patch); - schedule_work(&old_patch->free_work); } } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index ec43a40b853f..38209c7361b6 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -13,8 +13,9 @@ extern struct list_head klp_patches; #define klp_for_each_patch(patch) \ list_for_each_entry(patch, &klp_patches, list) -void klp_free_patch_start(struct klp_patch *patch); -void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_free_patch_async(struct klp_patch *patch); +void klp_free_replaced_patches_async(struct klp_patch *new_patch); +void klp_unpatch_replaced_patches(struct klp_patch *new_patch); void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index cdf318d86dd6..f6310f848f34 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -78,7 +78,7 @@ static void klp_complete_transition(void) klp_target_state == KLP_PATCHED ? 
"patching" : "unpatching"); if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { - klp_discard_replaced_patches(klp_transition_patch); + klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } @@ -446,14 +446,14 @@ void klp_try_complete_transition(void) klp_complete_transition(); /* - * It would make more sense to free the patch in + * It would make more sense to free the unused patches in * klp_complete_transition() but it is called also * from klp_cancel_transition(). */ - if (!patch->enabled) { - klp_free_patch_start(patch); - schedule_work(&patch->free_work); - } + if (!patch->enabled) + klp_free_patch_async(patch); + else if (patch->replace) + klp_free_replaced_patches_async(patch); } /* -- cgit v1.2.3 From 73727f4dafa2df107e85753c5ab703a1f344e1f1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:10 +0100 Subject: livepatch: Basic API to track system state changes This is another step how to help maintaining more livepatches. One big help was the atomic replace and cumulative livepatches. These livepatches replace the already installed ones. Therefore it should be enough when each cumulative livepatch is consistent. The problems might come with shadow variables and callbacks. They might change the system behavior or state so that it is no longer safe to go back and use an older livepatch or the original kernel code. Also, a new livepatch must be able to detect changes which were made by the already installed livepatches. This is where the livepatch system state tracking gets useful. It allows to: - find whether a system state has already been modified by previous livepatches - store data needed to manipulate and restore the system state The information about the manipulated system states is stored in an array of struct klp_state. It can be searched by two new functions klp_get_state() and klp_get_prev_state(). The dependencies are going to be solved by a version field added later. The only important information is that it will be allowed to modify the same state by more non-cumulative livepatches. It is similar to allowing to modify the same function several times. The livepatch author is responsible for preventing incompatible changes. 
Link: http://lkml.kernel.org/r/20191030154313.13263-3-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/Makefile | 2 +- kernel/livepatch/state.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 kernel/livepatch/state.c (limited to 'kernel') diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index cf9b5bcdb952..cf03d4bdfc66 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o shadow.o transition.o +livepatch-objs := core.o patch.o shadow.o state.o transition.o diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c new file mode 100644 index 000000000000..6ab15b642c0a --- /dev/null +++ b/kernel/livepatch/state.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * system_state.c - State of the system modified by livepatches + * + * Copyright (C) 2019 SUSE + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include "core.h" +#include "transition.h" + +#define klp_for_each_state(patch, state) \ + for (state = patch->states; state && state->id; state++) + +/** + * klp_get_state() - get information about system state modified by + * the given patch + * @patch: livepatch that modifies the given system state + * @id: custom identifier of the modified system state + * + * Checks whether the given patch modifies the given system state. + * + * The function can be called either from pre/post (un)patch + * callbacks or from the kernel code added by the livepatch. + * + * Return: pointer to struct klp_state when found, otherwise NULL. + */ +struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id) +{ + struct klp_state *state; + + klp_for_each_state(patch, state) { + if (state->id == id) + return state; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_get_state); + +/** + * klp_get_prev_state() - get information about system state modified by + * the already installed livepatches + * @id: custom identifier of the modified system state + * + * Checks whether already installed livepatches modify the given + * system state. + * + * The same system state can be modified by more non-cumulative + * livepatches. It is expected that the latest livepatch has + * the most up-to-date information. + * + * The function can be called only during transition when a new + * livepatch is being enabled or when such a transition is reverted. + * It is typically called only from from pre/post (un)patch + * callbacks. + * + * Return: pointer to the latest struct klp_state from already + * installed livepatches, NULL when not found. 
+ */ +struct klp_state *klp_get_prev_state(unsigned long id) +{ + struct klp_patch *patch; + struct klp_state *state, *last_state = NULL; + + if (WARN_ON_ONCE(!klp_transition_patch)) + return NULL; + + klp_for_each_patch(patch) { + if (patch == klp_transition_patch) + goto out; + + state = klp_get_state(patch, id); + if (state) + last_state = state; + } + +out: + return last_state; +} +EXPORT_SYMBOL_GPL(klp_get_prev_state); -- cgit v1.2.3 From 92c9abf5e57500ea7dc59a55273aa7850b631bda Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:11 +0100 Subject: livepatch: Allow to distinguish different version of system state changes The atomic replace runs pre/post (un)install callbacks only from the new livepatch. There are several reasons for this: + Simplicity: clear ordering of operations, no interactions between old and new callbacks. + Reliability: only new livepatch knows what changes can already be made by older livepatches and how to take over the state. + Testing: the atomic replace can be properly tested only when a newer livepatch is available. It might be too late to fix unwanted effect of callbacks from older livepatches. It might happen that an older change is not enough and the same system state has to be modified another way. Different changes need to get distinguished by a version number added to struct klp_state. The version can also be used to prevent loading incompatible livepatches. The check is done when the livepatch is enabled. The rules are: + Any completely new system state modification is allowed. + System state modifications with the same or higher version are allowed for already modified system states. + Cumulative livepatches must handle all system state modifications from already installed livepatches. + Non-cumulative livepatches are allowed to touch already modified system states. Link: http://lkml.kernel.org/r/20191030154313.13263-4-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 8 ++++++++ kernel/livepatch/state.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/livepatch/state.h | 9 +++++++++ 3 files changed, 53 insertions(+) create mode 100644 kernel/livepatch/state.h (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1e1d87ead55c..c3512e7e0801 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -22,6 +22,7 @@ #include #include "core.h" #include "patch.h" +#include "state.h" #include "transition.h" /* @@ -1009,6 +1010,13 @@ int klp_enable_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); + if (!klp_is_patch_compatible(patch)) { + pr_err("Livepatch patch (%s) is not compatible with the already installed livepatches.\n", + patch->mod->name); + mutex_unlock(&klp_mutex); + return -EINVAL; + } + ret = klp_init_patch_early(patch); if (ret) { mutex_unlock(&klp_mutex); diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 6ab15b642c0a..7ee19476de9d 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -9,6 +9,7 @@ #include #include "core.h" +#include "state.h" #include "transition.h" #define klp_for_each_state(patch, state) \ @@ -81,3 +82,38 @@ out: return last_state; } EXPORT_SYMBOL_GPL(klp_get_prev_state); + +/* Check if the patch is able to deal with the existing system state. 
*/ +static bool klp_is_state_compatible(struct klp_patch *patch, + struct klp_state *old_state) +{ + struct klp_state *state; + + state = klp_get_state(patch, old_state->id); + + /* A cumulative livepatch must handle all already modified states. */ + if (!state) + return !patch->replace; + + return state->version >= old_state->version; +} + +/* + * Check that the new livepatch will not break the existing system states. + * Cumulative patches must handle all already modified states. + * Non-cumulative patches can touch already modified states. + */ +bool klp_is_patch_compatible(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_state *old_state; + + klp_for_each_patch(old_patch) { + klp_for_each_state(old_patch, old_state) { + if (!klp_is_state_compatible(patch, old_state)) + return false; + } + } + + return true; +} diff --git a/kernel/livepatch/state.h b/kernel/livepatch/state.h new file mode 100644 index 000000000000..49d9c16e8762 --- /dev/null +++ b/kernel/livepatch/state.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIVEPATCH_STATE_H +#define _LIVEPATCH_STATE_H + +#include + +bool klp_is_patch_compatible(struct klp_patch *patch); + +#endif /* _LIVEPATCH_STATE_H */ -- cgit v1.2.3 From 64fe8c061de7d7ffdfdc104a57d48d645ae0ac4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 1 Nov 2019 12:03:44 +0100 Subject: xsk: Store struct xdp_sock as a flexible array member of the XSKMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to this commit, the XDP socket instances were stored in a separately allocated array of the XSKMAP. Now, we store the sockets as a flexible array member, in a similar fashion to the arraymap. Doing so, we do less pointer chasing in the lookup.
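The layout change uses the common flexible-array-member idiom: the socket pointers now live at the tail of the map structure itself, so one allocation covers everything and a lookup needs no extra pointer dereference. A generic sketch of the idiom (illustrative only; the struct and function names are made up):

    struct table {
    	unsigned int max_entries;
    	void *slot[];		/* entries stored at the tail of the struct */
    };

    static struct table *table_alloc(unsigned int n)
    {
    	/* one allocation covers the header and all n slots */
    	struct table *t = kzalloc(struct_size(t, slot, n), GFP_KERNEL);

    	if (t)
    		t->max_entries = n;
    	return t;		/* a lookup is simply t->slot[i] */
    }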
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/20191101110346.15004-2-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 55 ++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 82a1ffe15dfa..edcbd863650e 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -11,9 +11,9 @@ struct xsk_map { struct bpf_map map; - struct xdp_sock **xsk_map; struct list_head __percpu *flush_list; spinlock_t lock; /* Synchronize map updates */ + struct xdp_sock *xsk_map[]; }; int xsk_map_inc(struct xsk_map *map) @@ -80,9 +80,10 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { + struct bpf_map_memory mem; + int cpu, err, numa_node; struct xsk_map *m; - int cpu, err; - u64 cost; + u64 cost, size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -92,44 +93,35 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) return ERR_PTR(-EINVAL); - m = kzalloc(sizeof(*m), GFP_USER); - if (!m) + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); + + err = bpf_map_charge_init(&mem, cost); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); - cost += sizeof(struct list_head) * num_possible_cpus(); - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_charge_init(&m->map.memory, cost); - if (err) - goto free_m; - - err = -ENOMEM; - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) - goto free_charge; + if (!m->flush_list) { + bpf_map_charge_finish(&m->map.memory); + bpf_map_area_free(m); + return ERR_PTR(-ENOMEM); + } for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - m->xsk_map = bpf_map_area_alloc(m->map.max_entries * - sizeof(struct xdp_sock *), - m->map.numa_node); - if (!m->xsk_map) - goto free_percpu; return &m->map; - -free_percpu: - free_percpu(m->flush_list); -free_charge: - bpf_map_charge_finish(&m->map.memory); -free_m: - kfree(m); - return ERR_PTR(err); } static void xsk_map_free(struct bpf_map *map) @@ -139,8 +131,7 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); free_percpu(m->flush_list); - bpf_map_area_free(m->xsk_map); - kfree(m); + bpf_map_area_free(m); } static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -- cgit v1.2.3 From e65650f291ee72fc578d6346080fc4699204ef2c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 1 Nov 2019 12:03:45 +0100 Subject: bpf: Implement map_gen_lookup() callback for XSKMAP Inline the xsk_map_lookup_elem() via implementing the map_gen_lookup() callback. This results in emitting the bpf instructions in place of bpf_map_lookup_elem() helper call and better performance of bpf programs. 
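In C terms, the instruction sequence emitted by the new callback (see the xsk_map_gen_lookup() hunk in the diff below) is roughly equivalent to the following sketch, executed inline in the calling BPF program instead of going through the bpf_map_lookup_elem() helper:

    /* not code from the patch; a C rendering of the emitted instructions */
    static void *inlined_xsk_lookup(struct xsk_map *m, u32 key)
    {
    	if (key >= m->map.max_entries)
    		return NULL;
    	return m->xsk_map[key];		/* single direct load from the map tail */
    }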
Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Link: https://lore.kernel.org/bpf/20191101110346.15004-3-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index edcbd863650e..554939f78b83 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -163,6 +163,22 @@ struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) return xs; } +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, struct xdp_sock *xs) { @@ -303,6 +319,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, -- cgit v1.2.3 From d817991cc7486ab83f6c7188b0bc80eebee872f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 1 Nov 2019 12:03:46 +0100 Subject: xsk: Restructure/inline XSKMAP lookup/redirect/flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the XSKMAP entry lookup function used by the XDP redirect code is moved from the xskmap.c file to the xdp_sock.h header, so the lookup can be inlined from, e.g., the bpf_xdp_redirect_map() function. Further the __xsk_map_redirect() and __xsk_map_flush() is moved to the xsk.c, which lets the compiler inline the xsk_rcv() and xsk_flush() functions. Finally, all the XDP socket functions were moved from linux/bpf.h to net/xdp_sock.h, where most of the XDP sockets functions are anyway. This yields a ~2% performance boost for the xdpsock "rx_drop" scenario. 
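Since the kernel/ side of this diff only shows deletions, note that __xsk_map_lookup_elem() presumably reappears as a static inline in include/net/xdp_sock.h so it can be inlined into bpf_xdp_redirect_map(); roughly:

    /* sketch of the header-side inline; see net/xdp_sock.h in this series */
    static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
    						     u32 key)
    {
    	struct xsk_map *m = container_of(map, struct xsk_map, map);

    	if (key >= map->max_entries)
    		return NULL;
    	return READ_ONCE(m->xsk_map[key]);
    }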
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191101110346.15004-4-bjorn.topel@gmail.com --- kernel/bpf/xskmap.c | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 554939f78b83..da16c30868f3 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -9,13 +9,6 @@ #include #include -struct xsk_map { - struct bpf_map map; - struct list_head __percpu *flush_list; - spinlock_t lock; /* Synchronize map updates */ - struct xdp_sock *xsk_map[]; -}; - int xsk_map_inc(struct xsk_map *map) { struct bpf_map *m = &map->map; @@ -151,18 +144,6 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; @@ -179,35 +160,6 @@ static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); - int err; - - err = xsk_rcv(xs, xdp); - if (err) - return err; - - if (!xs->flush_node.prev) - list_add(&xs->flush_node, flush_list); - - return 0; -} - -void __xsk_map_flush(struct bpf_map *map) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct list_head *flush_list = this_cpu_ptr(m->flush_list); - struct xdp_sock *xs, *tmp; - - list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { - xsk_flush(xs); - __list_del_clearprev(&xs->flush_node); - } -} - static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); -- cgit v1.2.3 From eb1b66887472eaa7342305b7890ae510dd9d1a79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:58 +0100 Subject: bpf: Make use of probe_user_write in probe write helper Convert the bpf_probe_write_user() helper to probe_user_write() such that writes are not attempted under KERNEL_DS anymore which is buggy as kernel and user space pointers can have overlapping addresses. Also, given we have the access_ok() check inside probe_user_write(), the helper doesn't need to do it twice. 
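Conceptually, probe_user_write() performs the store with the user address space selected and page faults disabled, along the lines of the simplified sketch below (the real implementation lives in mm/maccess.c; this is a from-memory approximation, not the exact code):

    static long probe_user_write_sketch(void __user *dst, const void *src,
    				    size_t size)
    {
    	mm_segment_t old_fs = get_fs();
    	long ret = -EFAULT;

    	set_fs(USER_DS);
    	if (access_ok(dst, size)) {
    		pagefault_disable();
    		ret = __copy_to_user_inatomic(dst, src, size);
    		pagefault_enable();
    	}
    	set_fs(old_fs);

    	return ret ? -EFAULT : 0;
    }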
Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/841c461781874c07a0ee404a454c3bc0459eed30.1572649915.git.daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f50bf19f7a05..2d87fcdcb19b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -163,7 +163,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, +BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* @@ -186,10 +186,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (!access_ok(unsafe_ptr, size)) - return -EPERM; - return probe_kernel_write(unsafe_ptr, src, size); + return probe_user_write(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). 
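From the BPF program side, the split means the program author states which address space a pointer belongs to instead of relying on the old ambiguous bpf_probe_read(). A minimal sketch in BPF C, assuming libbpf's bpf_helpers.h/bpf_tracing.h; the attach point and argument position are illustrative only:

    #include <linux/bpf.h>
    #include <linux/ptrace.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("kprobe/do_sys_open")
    int probe_open(struct pt_regs *ctx)
    {
    	/* second argument of do_sys_open() is a user-space filename pointer */
    	const char *uptr = (const char *)PT_REGS_PARM2(ctx);
    	char buf[64];

    	/* user pointer: use the _user variant, not the compat helper */
    	bpf_probe_read_user_str(buf, sizeof(buf), uptr);
    	return 0;
    }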
Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- kernel/trace/bpf_trace.c | 181 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 135 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d87fcdcb19b..ffc91d4935ac 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -138,24 +138,140 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) { - int ret; + int ret = probe_user_read(dst, unsafe_ptr, size); - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_proto = { + .func = bpf_probe_read_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_user_str_proto = { + .func = bpf_probe_read_user_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; - ret = probe_kernel_read(dst, unsafe_ptr, size); +static __always_inline int +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + ret = compat ? 
probe_kernel_read(dst, unsafe_ptr, size) : + probe_kernel_read_strict(dst, unsafe_ptr, size); if (unlikely(ret < 0)) out: memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_proto = { + .func = bpf_probe_read_kernel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); +} +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static __always_inline int +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, + const bool compat) +{ + int ret = security_locked_down(LOCKDOWN_BPF_READ); + + if (unlikely(ret < 0)) + goto out; + /* + * The strncpy_from_unsafe_*() call will likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read_*() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) : + strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) +out: + memset(dst, 0, size); return ret; } -static const struct bpf_func_proto bpf_probe_read_proto = { - .func = bpf_probe_read, +BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); +} + +static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { + .func = bpf_probe_read_kernel_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); +} + +static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { + .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, @@ -583,41 +699,6 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - int ret; - - ret = security_locked_down(LOCKDOWN_BPF_READ); - if (ret < 0) - goto out; - - /* - * The strncpy_from_unsafe() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing - * arbitrary memory anyway similar to bpf_probe_read() and might - * as well probe the stack. Thus, memory is explicitly cleared - * only in error case, so that improper users ignoring return - * code altogether don't copy garbage; otherwise length of string - * is returned that can be used for bpf_perf_event_output() et al. 
- */ - ret = strncpy_from_unsafe(dst, unsafe_ptr, size); - if (unlikely(ret < 0)) -out: - memset(dst, 0, size); - - return ret; -} - -static const struct bpf_func_proto bpf_probe_read_str_proto = { - .func = bpf_probe_read_str, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; @@ -697,8 +778,6 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; - case BPF_FUNC_probe_read: - return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_tail_call: @@ -725,8 +804,18 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_compat_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_probe_read_str: - return &bpf_probe_read_str_proto; + return &bpf_probe_read_compat_str_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; -- cgit v1.2.3 From 6e07a6341277a1dbdf5ed0c41921033c234c1635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:18:00 +0100 Subject: bpf: Switch BPF probe insns to bpf_probe_read_kernel Commit 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") explicitly states that the pointer to BTF object is a pointer to a kernel object or NULL. Therefore we should also switch to using the strict kernel probe helper which is restricted to kernel addresses only when architectures have non-overlapping address spaces. 
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/d2b90827837685424a4b8008dfe0460558abfada.1572649915.git.daniel@iogearbox.net --- kernel/bpf/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8d3fbc86ca5e..df82d5a42b23 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1306,11 +1306,12 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON -u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr) +u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) { memset(dst, 0, size); return -EFAULT; } + /** * __bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1566,9 +1567,9 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read(&DST, SIZE, (const void *)(long) SRC); \ +#define LDX_PROBE(SIZEOP, SIZE) \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit v1.2.3 From fbf6c73c5b264c25484fa9f449b5546569fe11f0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 17:51:10 +0100 Subject: ftrace: add ftrace_init_nop() Architectures may need to perform special initialization of ftrace callsites, and today they do so by special-casing ftrace_make_nop() when the expected branch address is MCOUNT_ADDR. In some cases (e.g. for patchable-function-entry), we don't have an mcount-like symbol and don't want a synthetic MCOUNT_ADDR, but we may need to perform some initialization of callsites. To make it possible to separate initialization from runtime modification, and to handle cases without an mcount-like symbol, this patch adds an optional ftrace_init_nop() function that architectures can implement, which does not pass a branch address. Where an architecture does not provide ftrace_init_nop(), we will fall back to the existing behaviour of calling ftrace_make_nop() with MCOUNT_ADDR. At the same time, ftrace_code_disable() is renamed to ftrace_nop_initialize() to make it clearer that it is intended to intialize a callsite into a disabled state, and is not for disabling a callsite that has been runtime enabled. The kerneldoc description of rec arguments is updated to cover non-mcount callsites. 
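The fallback described above presumably takes the form of a default definition in include/linux/ftrace.h, guarded so an architecture can supply its own; roughly:

    #ifndef ftrace_init_nop
    static inline int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
    {
    	return ftrace_make_nop(mod, rec, MCOUNT_ADDR);
    }
    #endif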
Signed-off-by: Mark Rutland Reviewed-by: Amit Daniel Kachhap Reviewed-by: Ard Biesheuvel Reviewed-by: Miroslav Benes Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..5259d4dea675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2494,14 +2494,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) } static int -ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) { int ret; if (unlikely(ftrace_disabled)) return 0; - ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + ret = ftrace_init_nop(mod, rec); if (ret) { ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); @@ -2943,7 +2943,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * to the NOP instructions. */ if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_code_disable(mod, p)) + !ftrace_nop_initialize(mod, p)) break; update_cnt++; -- cgit v1.2.3 From a1326b17ac03a9012cb3d01e434aacb4d67a416c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 18:17:11 +0100 Subject: module/ftrace: handle patchable-function-entry When using patchable-function-entry, the compiler will record the callsites into a section named "__patchable_function_entries" rather than "__mcount_loc". Let's abstract this difference behind a new FTRACE_CALLSITE_SECTION, so that architectures don't have to handle this explicitly (e.g. with custom module linker scripts). As parisc currently handles this explicitly, it is fixed up accordingly, with its custom linker script removed. Since FTRACE_CALLSITE_SECTION is only defined when DYNAMIC_FTRACE is selected, the parisc module loading code is updated to only use the definition in that case. When DYNAMIC_FTRACE is not selected, modules shouldn't have this section, so this removes some redundant work in that case. To make sure that this is keep up-to-date for modules and the main kernel, a comment is added to vmlinux.lds.h, with the existing ifdeffery simplified for legibility. I built parisc generic-{32,64}bit_defconfig with DYNAMIC_FTRACE enabled, and verified that the section made it into the .ko files for modules. Signed-off-by: Mark Rutland Acked-by: Helge Deller Acked-by: Steven Rostedt (VMware) Reviewed-by: Ard Biesheuvel Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar Cc: James E.J. 
Bottomley Cc: Jessica Yu Cc: linux-parisc@vger.kernel.org --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..acf7962936c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3222,7 +3222,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(info, "__mcount_loc", + mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -- cgit v1.2.3 From 1bb5ec2eec48dcab1d8ae3707e4a388da6a9c9dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2019 12:49:57 -0800 Subject: cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency cgroup->bstat_pending is used to determine the base stat delta to propagate to the parent. While correct, this is different from how percpu delta is determined for no good reason and the inconsistency makes the code more difficult to understand. This patch makes parent propagation delta calculation use the same method as percpu to global propagation. * cgroup_base_stat_accumulate() is renamed to cgroup_base_stat_add() and cgroup_base_stat_sub() is added. * percpu propagation calculation is updated to use the above helpers. * cgroup->bstat_pending is replaced with cgroup->last_bstat and updated to use the same calculation as percpu propagation. Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ca19b4c8acf5..b48b22d4deb6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. 
*/ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); + + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * -- cgit v1.2.3 From d0fbb51dfaa612f960519b798387be436e8f83c5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Nov 2019 12:15:36 +0300 Subject: bpf, offload: Unlock on error in bpf_offload_dev_create() We need to drop the bpf_devs_lock on error before returning. 
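The bug follows a familiar pattern: an early return from inside a locked section. A generic sketch of the alternative single-exit shape that avoids it (illustrative only; the foo_* names are made up and this is not a drop-in replacement for bpf_offload_dev_create()):

    static struct foo *foo_create(void)
    {
    	struct foo *dev = NULL;
    	int err;

    	down_write(&foo_lock);
    	err = foo_table_init();
    	if (err)
    		goto out_unlock;	/* the lock is always dropped below */

    	dev = foo_alloc();
    	if (!dev)
    		err = -ENOMEM;
    out_unlock:
    	up_write(&foo_lock);
    	return err ? ERR_PTR(err) : dev;
    }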
Fixes: 9fd7c5559165 ("bpf: offload: aggregate offloads per-device") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191104091536.GB31509@mwanda --- kernel/bpf/offload.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ba635209ae9a..5b9da0954a27 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -678,8 +678,10 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) down_write(&bpf_devs_lock); if (!offdevs_inited) { err = rhashtable_init(&offdevs, &offdevs_params); - if (err) + if (err) { + up_write(&bpf_devs_lock); return ERR_PTR(err); + } offdevs_inited = true; } up_write(&bpf_devs_lock); -- cgit v1.2.3 From 85d31dd07002315c0d1816543fdc392c8cc6b73a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 6 Nov 2019 17:46:40 -0800 Subject: bpf: Account for insn->off when doing bpf_probe_read_kernel In the bpf interpreter mode, bpf_probe_read_kernel is used to read from PTR_TO_BTF_ID's kernel object. It currently misses taking insn->off into account. This patch fixes it. Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191107014640.384083-1-kafai@fb.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 97e37d82a1cc..c1fde0303280 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1569,7 +1569,7 @@ out: #undef LDST #define LDX_PROBE(SIZEOP, SIZE) \ LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) SRC); \ + bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ CONT; LDX_PROBE(B, 1) LDX_PROBE(H, 2) -- cgit v1.2.3 From 742e8cd3e1ba6f19cad6d912f8d469df5557d0fd Mon Sep 17 00:00:00 2001 From: Honglei Wang Date: Wed, 30 Oct 2019 16:18:10 +0800 Subject: cgroup: freezer: don't change task and cgroups status unnecessarily There is no need to adjust the task state and revisit the state of the source and destination cgroups if neither cgroup is freezing and the task itself is not frozen. In that scenario the current code also wakes up a task that is not supposed to become runnable. Skipping the unnecessary task state adjustment avoids waking the task up for no reason. Signed-off-by: Honglei Wang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 8cf010680678..3984dd6b8ddb 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -230,6 +230,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, if (task->flags & PF_KTHREAD) return; + /* + * It's not necessary to do changes if both of the src and dst cgroups + * are not freezing and task is not frozen. + */ + if (!test_bit(CGRP_FREEZE, &src->flags) && + !test_bit(CGRP_FREEZE, &dst->flags) && + !task->frozen) + return; + /* * Adjust counters of freezing and frozen tasks.
* Note, that if the task is frozen, but the destination cgroup is not -- cgit v1.2.3 From 7e3617a72df32341fea6d226cd6bb21de40c558d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 7 Nov 2019 10:09:03 -0800 Subject: bpf: Add array support to btf_struct_access This patch adds array support to btf_struct_access(). It supports array of int, array of struct and multidimensional array. It also allows using u8[] as a scratch space. For example, it allows access the "char cb[48]" with size larger than the array's element "char". Another potential use case is "u64 icsk_ca_priv[]" in the tcp congestion control. btf_resolve_size() is added to resolve the size of any type. It will follow the modifier if there is any. Please see the function comment for details. This patch also adds the "off < moff" check at the beginning of the for loop. It is to reject cases when "off" is pointing to a "hole" in a struct. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191107180903.4097702-1-kafai@fb.com --- kernel/bpf/btf.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 128d89601d73..4639c4ba9a9b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1036,6 +1036,82 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; } +/* Resolve the size of a passed-in "type" + * + * type: is an array (e.g. u32 array[x][y]) + * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY, + * *type_size: (x * y * sizeof(u32)). Hence, *type_size always + * corresponds to the return type. + * *elem_type: u32 + * *total_nelems: (x * y). Hence, individual elem size is + * (*type_size / *total_nelems) + * + * type: is not an array (e.g. const struct X) + * return type: type "struct X" + * *type_size: sizeof(struct X) + * *elem_type: same as return type ("struct X") + * *total_nelems: 1 + */ +static const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *total_nelems) +{ + const struct btf_type *array_type = NULL; + const struct btf_array *array; + u32 i, size, nelems = 1; + + for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { + switch (BTF_INFO_KIND(type->info)) { + /* type->size can be used */ + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + size = type->size; + goto resolved; + + case BTF_KIND_PTR: + size = sizeof(void *); + goto resolved; + + /* Modifiers */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + type = btf_type_by_id(btf, type->type); + break; + + case BTF_KIND_ARRAY: + if (!array_type) + array_type = type; + array = btf_type_array(type); + if (nelems && array->nelems > U32_MAX / nelems) + return ERR_PTR(-EINVAL); + nelems *= array->nelems; + type = btf_type_by_id(btf, array->type); + break; + + /* type without size */ + default: + return ERR_PTR(-EINVAL); + } + } + + return ERR_PTR(-EINVAL); + +resolved: + if (nelems && size > U32_MAX / nelems) + return ERR_PTR(-EINVAL); + + *type_size = nelems * size; + *total_nelems = nelems; + *elem_type = type; + + return array_type ? 
: type; +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) @@ -3494,10 +3570,10 @@ int btf_struct_access(struct bpf_verifier_log *log, enum bpf_access_type atype, u32 *next_btf_id) { + u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; + const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const struct btf_type *mtype; const char *tname, *mname; - int i, moff = 0, msize; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3507,40 +3583,88 @@ again: } for_each_member(i, t, member) { - /* offset of the field in bits */ - moff = btf_member_bit_offset(t, member); - if (btf_member_bitfield_size(t, member)) /* bitfields are not supported yet */ continue; - if (off + size <= moff / 8) + /* offset of the field in bytes */ + moff = btf_member_bit_offset(t, member) / 8; + if (off + size <= moff) /* won't find anything, field is already too far */ break; + /* In case of "off" is pointing to holes of a struct */ + if (off < moff) + continue; /* type of the field */ mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); - /* skip modifiers */ - while (btf_type_is_modifier(mtype)) - mtype = btf_type_by_id(btf_vmlinux, mtype->type); - - if (btf_type_is_array(mtype)) - /* array deref is not supported yet */ - continue; - - if (!btf_type_has_size(mtype) && !btf_type_is_ptr(mtype)) { + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, + &elem_type, &total_nelems); + if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; } - if (btf_type_is_ptr(mtype)) - msize = 8; - else - msize = mtype->size; - if (off >= moff / 8 + msize) + + mtrue_end = moff + msize; + if (off >= mtrue_end) /* no overlap with member, keep iterating */ continue; + + if (btf_type_is_array(mtype)) { + u32 elem_idx; + + /* btf_resolve_size() above helps to + * linearize a multi-dimensional array. + * + * The logic here is treating an array + * in a struct as the following way: + * + * struct outer { + * struct inner array[2][2]; + * }; + * + * looks like: + * + * struct outer { + * struct inner array_elem0; + * struct inner array_elem1; + * struct inner array_elem2; + * struct inner array_elem3; + * }; + * + * When accessing outer->array[1][0], it moves + * moff to "array_elem2", set mtype to + * "struct inner", and msize also becomes + * sizeof(struct inner). Then most of the + * remaining logic will fall through without + * caring the current member is an array or + * not. + * + * Unlike mtype/msize/moff, mtrue_end does not + * change. The naming difference ("_true") tells + * that it is not always corresponding to + * the current mtype/msize/moff. + * It is the true end of the current + * member (i.e. array in this case). That + * will allow an int array to be accessed like + * a scratch space, + * i.e. allow access beyond the size of + * the array's element as long as it is + * within the mtrue_end boundary. 
+ */ + + /* skip empty array */ + if (moff == mtrue_end) + continue; + + msize /= total_nelems; + elem_idx = (off - moff) / msize; + moff += elem_idx * msize; + mtype = elem_type; + } + /* the 'off' we're looking for is either equal to start * of this field or inside of this struct */ @@ -3549,20 +3673,20 @@ again: t = mtype; /* adjust offset we're looking for */ - off -= moff / 8; + off -= moff; goto again; } - if (msize != size) { - /* field access size doesn't match */ - bpf_log(log, - "cannot access %d bytes in struct %s field %s that has size %d\n", - size, tname, mname, msize); - return -EACCES; - } if (btf_type_is_ptr(mtype)) { const struct btf_type *stype; + if (msize != size || off != moff) { + bpf_log(log, + "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n", + mname, moff, tname, off, size); + return -EACCES; + } + stype = btf_type_by_id(btf_vmlinux, mtype->type); /* skip modifiers */ while (btf_type_is_modifier(stype)) @@ -3572,7 +3696,20 @@ again: return PTR_TO_BTF_ID; } } - /* all other fields are treated as scalars */ + + /* Allow more flexible access within an int as long as + * it is within mtrue_end. + * Since mtrue_end could be the end of an array, + * that also allows using an array of int as a scratch + * space. e.g. skb->cb[]. + */ + if (off + size > mtrue_end) { + bpf_log(log, + "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", + mname, mtrue_end, tname, off, size); + return -EACCES; + } + return SCALAR_VALUE; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); -- cgit v1.2.3 From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This makes ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes.
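To illustrate the layout described above, a minimal sketch of the packing (the example_* helpers are illustrative and not necessarily the exact kernel accessors):

/* 64bit ID: ino in the low 32 bits, gen in the high 32 bits. */
static inline u64 example_make_kernfs_id(u32 ino, u32 gen)
{
	return (u64)gen << 32 | ino;
}

static inline u32 example_kernfs_id_ino(u64 id)
{
	return (u32)id;			/* lower 32 bits */
}

static inline u32 example_kernfs_id_gen(u64 id)
{
	return (u32)(id >> 32);		/* upper 32 bits */
}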
Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 3 +-- kernel/trace/blktrace.c | 67 +++++++++++++++++++++------------------------- 4 files changed, 34 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..912e761cd17a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgrp->kn->id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..5d867f6d7204 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup->kn->id; map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cf32c0c7a45d..c6bd1a5a1977 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5786,8 +5786,7 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..a986d2e74ca2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? 
cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; + return 0; return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - 
(has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1269,9 +1263,10 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, blkcg_name_buf, act, rwbs); } else trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %lx,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + kernfs_id_ino(id), kernfs_id_gen(id), + act, rwbs); } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); -- cgit v1.2.3 From fe0f726c9fb626b1092a9ea3bf75f57f2eed676e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: combine ino/id lookup functions into kernfs_find_and_get_node_by_id() kernfs_find_and_get_node_by_ino() looks the kernfs_node matching the specified ino. On top of that, kernfs_get_node_by_id() and kernfs_fh_get_inode() implement full ID matching by testing the rest of ID. On surface, confusingly, the two are slightly different in that the latter uses 0 gen as wildcard while the former doesn't - does it mean that the latter can't uniquely identify inodes w/ 0 gen? In practice, this is a distinction without a difference because generation number starts at 1. There are no actual IDs with 0 gen, so it can always safely used as wildcard. Let's simplify the code by renaming kernfs_find_and_get_node_by_ino() to kernfs_find_and_get_node_by_id(), moving all lookup logics into it, and removing now unnecessary kernfs_get_node_by_id(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c6bd1a5a1977..b5dcbee5aa6c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5790,7 +5790,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); -- cgit v1.2.3 From 40430452fd5da1509177ac597b394614cd3a121f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: use 64bit inos if ino_t is 64bit Each kernfs_node is identified with a 64bit ID. The low 32bit is exposed as ino and the high gen. While this already allows using inos as keys by looking up with wildcard generation number of 0, it's adding unnecessary complications for 64bit ino archs which can directly use kernfs_node IDs as inos to uniquely identify each cgroup instance. 
This patch exposes IDs directly as inos on 64bit ino archs. The conversion is mostly straight-forward. * 32bit ino archs behave the same as before. 64bit ino archs now use the whole 64bit ID as ino and the generation number is fixed at 1. * 64bit inos still use the same idr allocator which gurantees that the lower 32bits identify the current live instance uniquely and the high 32bits are incremented whenever the low bits wrap. As the upper 32bits are no longer used as gen and we don't wanna start ino allocation with 33rd bit set, the initial value for highbits allocation is changed to 0 on 64bit ino archs. * blktrace exposes two 32bit numbers - (INO,GEN) pair - to identify the issuing cgroup. Userland builds FILEID_INO32_GEN fids from these numbers to look up the cgroups. To remain compatible with the behavior, always output (LOW32,HIGH32) which will be constructed back to the original 64bit ID by __kernfs_fh_to_dentry(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- kernel/trace/blktrace.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a986d2e74ca2..a7dac5b63f3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1261,12 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regarldess of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). + */ trace_seq_printf(&iter->seq, - "%3d,%-3d %lx,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - kernfs_id_ino(id), kernfs_id_gen(id), - act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); -- cgit v1.2.3 From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. 
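In other words, after this change the cgroup ID is simply the kernfs node ID; a rough sketch of what the new accessor amounts to (illustrative only, see the real helper in the cgroup headers):

/* Illustrative only: cgroup ID == kernfs node ID. */
static inline u64 example_cgroup_id(const struct cgroup *cgrp)
{
	return cgrp->kn->id;
}

On 64bit-ino systems this is the same value userland sees as the cgroupfs inode number, so e.g. stat(2) on the cgroup directory is enough to discover it.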
Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 76 +++++++++++++++------------------------------- kernel/trace/blktrace.c | 4 +-- 4 files changed, 29 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 912e761cd17a..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5d867f6d7204..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b5dcbee5aa6c..c12dcf7dc432 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1308,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1917,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1938,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -1976,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -3552,22 +3544,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? 
&psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4987,9 +4979,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5154,10 +5143,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5177,15 +5168,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5195,7 +5184,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5219,7 +5208,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5248,12 +5237,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. @@ -5267,8 +5250,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5305,7 +5288,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ @@ -5321,27 +5303,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. 
*/ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5356,7 +5330,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7dac5b63f3f..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -171,7 +171,7 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif @@ -759,7 +759,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bio->bi_blkg) return 0; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) -- cgit v1.2.3 From d7495343228f30d8206e92dccfd1c41adcfa142d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:46:51 -0800 Subject: cgroup: fix incorrect WARN_ON_ONCE() in cgroup_setup_root() 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") added WARN which triggers if cgroup_id(root_cgrp) is not 1. This is fine on 64bit ino archs but on 32bit archs cgroup ID is ((gen << 32) | ino) and gen starts at 1, so the root id is 0x1_0000_0001 instead of 1 always triggering the WARN. What we wanna make sure is that the ino part is 1. Fix it. Reported-by: Naresh Kamboju Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c12dcf7dc432..53098c1d45e2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1966,7 +1966,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; - WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); -- cgit v1.2.3 From 3ca270fc9edb258d5bfa271bcf851614e9e6e7d4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:38 +0800 Subject: perf/core: Provide a kernel-internal interface to recalibrate event period Currently, perf_event_period() is used by user tools via ioctl. Based on naming convention, exporting perf_event_period() for kernel users (such as KVM) who may recalibrate the event period for their assigned counter according to their requirements. The perf_event_period() is an external accessor, just like the perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). 
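As a usage sketch, an in-kernel user such as a vPMU emulation path could now recalibrate the period directly (illustrative only; everything except perf_event_period() is made up):

/* Illustrative only: kernel-side period update for an owned event. */
static int example_update_sample_period(struct perf_event *event, u64 period)
{
	/* perf_event_period() takes perf_event_ctx_lock() internally */
	return perf_event_period(event, period);
}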
Suggested-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..e1b83d2731da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5106,16 +5106,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value) return event->pmu->check_period(event, value); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int _perf_event_period(struct perf_event *event, u64 value) { - u64 value; - if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; @@ -5133,6 +5128,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) @@ -5176,8 +5184,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); -- cgit v1.2.3 From 52ba4b0b99770e892f43da1238f437155acb8b58 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:39 +0800 Subject: perf/core: Provide a kernel-internal interface to pause perf_event Exporting perf_event_pause() as an external accessor for kernel users (such as KVM) who may do both disable perf_event and read count with just one time to hold perf_event_ctx_lock. Also the value could be reset optionally. Suggested-by: Peter Zijlstra Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e1b83d2731da..fc9f5ebf4849 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5029,6 +5029,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. 
*/ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block -- cgit v1.2.3 From 49e9d1a9faf2f71fdfd80a30697ee9a15070626d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Nov 2019 19:01:25 +0100 Subject: workqueue: Add RCU annotation for pwq list walk An additional check has been recently added to ensure that a RCU related lock is held while the RCU list is iterated. The `pwqs' are sometimes iterated without a RCU lock but with the &wq->mutex acquired leading to a warning. Teach list_for_each_entry_rcu() that the RCU usage is okay if &wq->mutex is acquired during the list traversal. Fixes: 28875945ba98d ("rcu: Add support for consolidated-RCU reader checking") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4dc8270326d7..914b845ad4ff 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -426,7 +426,8 @@ static void show_pwq(struct pool_workqueue *pwq); * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ + lockdep_is_held(&wq->mutex)) \ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else -- cgit v1.2.3 From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc Currently passing alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com --- kernel/bpf/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1fde0303280..99693f3c4e99 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include #include #include +#include #include /* Registers */ @@ -815,6 +816,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, struct bpf_binary_header *hdr; u32 size, hole, start, pages; + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. 
-- cgit v1.2.3 From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:03 -0800 Subject: bpf: Add bpf_arch_text_poke() helper Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch nops/calls in kernel text into calls into BPF trampoline and to patch calls/nops inside BPF programs too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org --- kernel/bpf/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 99693f3c4e99..434a0d920153 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2144,6 +2144,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. 
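As a sketch of what this looks like from the BPF side (illustrative pseudo-program; loader/section plumbing is omitted and the name is made up):

/* Illustrative only: an FENTRY-style program for eth_type_trans().
 * The generated trampoline saves the two pointer arguments as u64s on
 * its stack and passes a pointer to that area in R1, i.e. ctx below.
 */
int example_fentry_eth_type_trans(__u64 *ctx)
{
	__u64 skb = ctx[0];	/* first kernel argument (struct sk_buff *), unused in this sketch */
	__u64 dev = ctx[1];	/* second kernel argument (struct net_device *), unused in this sketch */

	/* the verifier permits reading only these two read-only slots */
	return 0;
}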
The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of a pointer to a different type within the BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both a kprobe and a kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence the BPF trampoline is re-generated every time a new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside the generated assembly code. The introduction of preemptible and sleepable BPF programs will completely remove the need to call __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in the detach path, half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly, which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to the raw kernel function arguments while BPF_TRACE_FEXIT programs have access to the kernel return value as well. Often a kprobe BPF program remembers function arguments in a map while the kretprobe fetches the arguments from the map and analyzes them together with the return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via a per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence the BPF trampoline doesn't provide builtin recursion prevention. It is a job for the BPF program itself and will be addressed in follow-up patches. The BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future, for example to remove the retpoline cost from XDP programs.
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- kernel/bpf/Makefile | 1 + kernel/bpf/btf.c | 77 ++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/syscall.c | 53 +++++++++- kernel/bpf/trampoline.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 42 ++++++++ 6 files changed, 419 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/trampoline.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1d9adb212f9..3f671bf617e8 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4639c4ba9a9b..9e1164e5b429 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3517,13 +3517,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, args++; nr_args--; } - if (arg >= nr_args) { + + if (prog->expected_attach_type == BPF_TRACE_FEXIT && + arg == nr_args) { + /* function return type */ + t = btf_type_by_id(btf_vmlinux, t->type); + } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", - tname, arg); + tname, arg + 1); return false; + } else { + t = btf_type_by_id(btf_vmlinux, args[arg].type); } - - t = btf_type_by_id(btf_vmlinux, args[arg].type); /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf_vmlinux, t->type); @@ -3784,6 +3789,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +static int __get_type_size(struct btf *btf, u32 btf_id, + const struct btf_type **bad_type) +{ + const struct btf_type *t; + + if (!btf_id) + /* void */ + return 0; + t = btf_type_by_id(btf, btf_id); + while (t && btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!t) + return -EINVAL; + if (btf_type_is_ptr(t)) + /* kernel size of pointer. Not BPF's size of pointer*/ + return sizeof(void *); + if (btf_type_is_int(t) || btf_type_is_enum(t)) + return t->size; + *bad_type = t; + return -EINVAL; +} + +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func, + const char *tname, + struct btf_func_model *m) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(func + 1); + nargs = btf_type_vlen(func); + if (nargs >= MAX_BPF_FUNC_ARGS) { + bpf_log(log, + "The function %s has %d arguments. 
Too many.\n", + tname, nargs); + return -EINVAL; + } + ret = __get_type_size(btf, func->type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s return type %s is unsupported.\n", + tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->ret_size = ret; + + for (i = 0; i < nargs; i++) { + ret = __get_type_size(btf, args[i].type, &t); + if (ret < 0) { + bpf_log(log, + "The function %s arg%d type %s is unsupported.\n", + tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + return -EINVAL; + } + m->arg_size[i] = ret; + } + m->nr_args = nargs; + return 0; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 434a0d920153..da5a8b8e278f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2015,6 +2015,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif + bpf_trampoline_put(aux->trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6d9ce95e5a8d..e2e37bea86bc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_tracing_prog_fops = { + .release = bpf_tracing_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +static int bpf_tracing_prog_attach(struct bpf_prog *prog) +{ + int tr_fd, err; + + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT) { + err = -EINVAL; + goto out_put_prog; + } + + err = bpf_trampoline_link_prog(prog); + if (err) + goto out_put_prog; + + tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, + prog, O_CLOEXEC); + if (tr_fd < 0) { + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + err = tr_fd; + goto out_put_prog; + } + return tr_fd; + +out_put_prog: + bpf_prog_put(prog); + return err; +} + struct bpf_raw_tracepoint { struct bpf_raw_event_map *btp; struct bpf_prog *prog; @@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (prog->type == BPF_PROG_TYPE_TRACING) { if (attr->raw_tracepoint.name) { - /* raw_tp name should not be specified in raw_tp - * programs that were verified via in-kernel BTF info + /* The attach point for this category of programs + * should be specified via btf_id during program load. */ err = -EINVAL; goto out_put_prog; } - /* raw_tp name is taken from type name instead */ - tp_name = prog->aux->attach_func_name; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + tp_name = prog->aux->attach_func_name; + else + return bpf_tracing_prog_attach(prog); } else { if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c new file mode 100644 index 000000000000..10ae59d65f13 --- /dev/null +++ b/kernel/bpf/trampoline.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2019 Facebook */ +#include +#include +#include + +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. 
*/ +#define TRAMPOLINE_HASH_BITS 10 +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) + +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; + +/* serializes access to trampoline_table */ +static DEFINE_MUTEX(trampoline_mutex); + +struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + struct bpf_trampoline *tr; + struct hlist_head *head; + void *image; + int i; + + mutex_lock(&trampoline_mutex); + head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist) { + if (tr->key == key) { + refcount_inc(&tr->refcnt); + goto out; + } + } + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out; + + /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) { + kfree(tr); + tr = NULL; + goto out; + } + + tr->key = key; + INIT_HLIST_NODE(&tr->hlist); + hlist_add_head(&tr->hlist, head); + refcount_set(&tr->refcnt, 1); + mutex_init(&tr->mutex); + for (i = 0; i < BPF_TRAMP_MAX; i++) + INIT_HLIST_HEAD(&tr->progs_hlist[i]); + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + tr->image = image; +out: + mutex_unlock(&trampoline_mutex); + return tr; +} + +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. Pick a number to fit into PAGE_SIZE / 2 + */ +#define BPF_MAX_TRAMP_PROGS 40 + +static int bpf_trampoline_update(struct bpf_trampoline *tr) +{ + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; + int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; + int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; + struct bpf_prog **progs, **fentry, **fexit; + u32 flags = BPF_TRAMP_F_RESTORE_REGS; + struct bpf_prog_aux *aux; + int err; + + if (fentry_cnt + fexit_cnt == 0) { + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP, + old_image, NULL); + tr->selector = 0; + goto out; + } + + /* populate fentry progs */ + fentry = progs = progs_to_run; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) + *progs++ = aux->prog; + + /* populate fexit progs */ + fexit = progs; + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) + *progs++ = aux->prog; + + if (fexit_cnt) + flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + + err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags, + fentry, fentry_cnt, + fexit, fexit_cnt, + tr->func.addr); + if (err) + goto out; + + if (tr->selector) + /* progs already running at this address */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL, + old_image, new_image); + else + /* first time registering */ + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL, + NULL, new_image); + if (err) + goto out; + tr->selector++; +out: + return err; +} + +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +{ + switch (t) { + case BPF_TRACE_FENTRY: + return BPF_TRAMP_FENTRY; + default: + return BPF_TRAMP_FEXIT; + } +} + +int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err = 0; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + if (tr->progs_cnt[BPF_TRAMP_FENTRY] + 
tr->progs_cnt[BPF_TRAMP_FEXIT] + >= BPF_MAX_TRAMP_PROGS) { + err = -E2BIG; + goto out; + } + if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + /* prog already linked */ + err = -EBUSY; + goto out; + } + hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + tr->progs_cnt[kind]++; + err = bpf_trampoline_update(prog->aux->trampoline); + if (err) { + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + } +out: + mutex_unlock(&tr->mutex); + return err; +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + enum bpf_tramp_prog_type kind; + struct bpf_trampoline *tr; + int err; + + tr = prog->aux->trampoline; + kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + mutex_lock(&tr->mutex); + hlist_del(&prog->aux->tramp_hlist); + tr->progs_cnt[kind]--; + err = bpf_trampoline_update(prog->aux->trampoline); + mutex_unlock(&tr->mutex); + return err; +} + +void bpf_trampoline_put(struct bpf_trampoline *tr) +{ + if (!tr) + return; + mutex_lock(&trampoline_mutex); + if (!refcount_dec_and_test(&tr->refcnt)) + goto out; + WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) + goto out; + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) + goto out; + bpf_jit_free_exec(tr->image); + hlist_del(&tr->hlist); + kfree(tr); +out: + mutex_unlock(&trampoline_mutex); +} + +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that + * are needed for trampoline. The macro is split into + * call _bpf_prog_enter + * call prog->bpf_func + * call __bpf_prog_exit + */ +u64 notrace __bpf_prog_enter(void) +{ + u64 start = 0; + + rcu_read_lock(); + preempt_disable(); + if (static_branch_unlikely(&bpf_stats_enabled_key)) + start = sched_clock(); + return start; +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +{ + struct bpf_prog_stats *stats; + + if (static_branch_unlikely(&bpf_stats_enabled_key) && + /* static_key could be enabled in __bpf_prog_enter + * and disabled in __bpf_prog_exit. + * And vice versa. + * Hence check that 'start' is not zero. 
+ */ + start) { + stats = this_cpu_ptr(prog->aux->stats); + u64_stats_update_begin(&stats->syncp); + stats->cnt++; + stats->nsecs += sched_clock() - start; + u64_stats_update_end(&stats->syncp); + } + preempt_enable(); + rcu_read_unlock(); +} + +int __weak +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + return -ENOTSUPP; +} + +static int __init init_trampolines(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_table[i]); + return 0; +} +late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2f2374967b36..8f89cfa93e88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct bpf_trampoline *tr; const struct btf_type *t; const char *tname; + int ret = 0; + long addr; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9432,6 +9435,45 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + tr = bpf_trampoline_lookup(btf_id); + if (!tr) + return -ENOMEM; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + mutex_lock(&tr->mutex); + if (tr->func.addr) { + prog->aux->trampoline = tr; + goto out; + } + ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + tname, &tr->func.model); + if (ret < 0) + goto out; + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } + tr->func.addr = (void *)addr; + prog->aux->trampoline = tr; +out: + mutex_unlock(&tr->mutex); + if (ret) + bpf_trampoline_put(tr); + return ret; default: return -EINVAL; } -- cgit v1.2.3 From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:14 -0800 Subject: bpf: Fix race in btf_resolve_helper_id() btf_resolve_helper_id() caching logic is a bit racy, since under root the verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE. Fix the type as well, since error is also recorded. 
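The fix is the usual lock-free "resolve once, cache forever" idiom: concurrent resolvers may race, but they all compute the same value (or the same error), so marked reads and writes are enough and no lock or cmpxchg is needed. A minimal sketch of that idiom, with a hypothetical expensive_resolve() standing in for the real BTF lookup:

	static int resolve_id_cached(int *cache)
	{
		int id = READ_ONCE(*cache);

		if (id)				/* fast path: value or cached -Exxx error */
			return id;
		id = expensive_resolve();	/* hypothetical; may run on several CPUs */
		WRITE_ONCE(*cache, id);		/* racing writers store the same value */
		return id;
	}

Since the cached value may be a negative errno, the cache and the return type have to be a signed int rather than u32.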
Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org --- kernel/bpf/btf.c | 26 +++++++++++++++++++++++++- kernel/bpf/verifier.c | 8 +++----- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9e1164e5b429..033d071eb59c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3721,7 +3721,8 @@ again: return -EINVAL; } -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, + int arg) { char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; const struct btf_param *args; @@ -3789,6 +3790,29 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int arg) +{ + int *btf_id = &fn->btf_id[arg]; + int ret; + + if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) + return -EINVAL; + + ret = READ_ONCE(*btf_id); + if (ret) + return ret; + /* ok to race the search. The result is the same */ + ret = __btf_resolve_helper_id(log, fn->func, arg); + if (!ret) { + /* Function argument cannot be type 'void' */ + bpf_log(log, "BTF resolution bug\n"); + return -EFAULT; + } + WRITE_ONCE(*btf_id, ret); + return ret; +} + static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f89cfa93e88..e78ec7990767 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4147,11 +4147,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { - if (!fn->btf_id[i]) - fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); - meta.btf_id = fn->btf_id[i]; - } + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); if (err) return err; -- cgit v1.2.3 From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:15 -0800 Subject: bpf: Annotate context types Annotate BPF program context types with program-side type and kernel-side type. This type information is used by the verifier. btf_get_prog_ctx_type() is used in the later patches to verify that BTF type of ctx in BPF program matches to kernel expected ctx type. For example, the XDP program type is: BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) That means that XDP program should be written as: int xdp_prog(struct xdp_md *ctx) { ... 
} Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org --- kernel/bpf/btf.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 4 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 114 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 033d071eb59c..4b7c8bd423d6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2,6 +2,8 @@ /* Copyright (c) 2018 Facebook */ #include +#include +#include #include #include #include @@ -16,6 +18,9 @@ #include #include #include +#include +#include +#include /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -3439,13 +3444,98 @@ errout: extern char __weak _binary__btf_vmlinux_bin_start[]; extern char __weak _binary__btf_vmlinux_bin_end[]; +extern struct btf *btf_vmlinux; + +#define BPF_MAP_TYPE(_id, _ops) +static union { + struct bpf_ctx_convert { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + prog_ctx_type _id##_prog; \ + kern_ctx_type _id##_kern; +#include +#undef BPF_PROG_TYPE + } *__t; + /* 't' is written once under lock. Read many times. */ + const struct btf_type *t; +} bpf_ctx_convert; +enum { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +static u8 bpf_ctx_convert_map[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + [_id] = __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +#undef BPF_MAP_TYPE + +static const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_type *ctx_struct; + const struct btf_member *ctx_type; + const char *tname, *ctx_tname; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return NULL; + } + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_struct(t)) { + /* Only pointer to struct is supported for now. + * That means that BPF_PROG_TYPE_TRACEPOINT with BTF + * is not supported yet. + * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. + */ + bpf_log(log, "BPF program ctx type is not a struct\n"); + return NULL; + } + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) { + bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + return NULL; + } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_struct is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); + if (!ctx_struct) + /* should not happen */ + return NULL; + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + if (!ctx_tname) { + /* should not happen */ + bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); + return NULL; + } + /* only compare that prog's ctx type name is the same as + * kernel expects. No need to compare field by field. 
+ * It's ok for bpf prog to do: + * struct __sk_buff {}; + * int socket_filter_bpf_prog(struct __sk_buff *skb) + * { // no fields of skb are ever used } + */ + if (strcmp(ctx_tname, tname)) + return NULL; + return ctx_type; +} struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err; + int err, i; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3479,6 +3569,26 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + /* find struct bpf_ctx_convert for type checking later */ + for (i = 1; i <= btf->nr_types; i++) { + const struct btf_type *t; + const char *tname; + + t = btf_type_by_id(btf, i); + if (!__btf_type_is_struct(t)) + continue; + tname = __btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, "bpf_ctx_convert")) { + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = t; + break; + } + } + if (i > btf->nr_types) { + err = -ENOENT; + goto errout; + } + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -3492,8 +3602,6 @@ errout: return ERR_PTR(err); } -extern struct btf *btf_vmlinux; - bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e2e37bea86bc..05a0ee75eca0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; static const struct bpf_map_ops * const bpf_map_types[] = { -#define BPF_PROG_TYPE(_id, _ops) +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, #include @@ -1189,7 +1189,7 @@ err_put: } static const struct bpf_prog_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e78ec7990767..7395d6bebefd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,7 +23,7 @@ #include "disasm.h" static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include -- cgit v1.2.3 From 8c1b6e69dcc1e11bd24111e3734dd740aaf3fda1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:16 -0800 Subject: bpf: Compare BTF types of functions arguments with actual types Make the verifier check that BTF types of function arguments match actual types passed into top-level BPF program and into BPF-to-BPF calls. If types match such BPF programs and sub-programs will have full support of BPF trampoline. If types mismatch the trampoline has to be conservative. It has to save/restore five program arguments and assume 64-bit scalars. 
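For illustration, the kind of source this check targets looks roughly like the hypothetical BPF C fragment below: the subprogram's BTF records arg0 as a pointer to the program context and arg1 as a scalar, and the verifier now checks that the call site really passes PTR_TO_CTX and SCALAR_VALUE there (otherwise the function's type info is marked unreliable and the trampoline falls back to the conservative mode):

	#include <linux/bpf.h>

	static __attribute__((noinline))
	int small_pkt(struct xdp_md *ctx, int limit)
	{
		void *data = (void *)(long)ctx->data;
		void *data_end = (void *)(long)ctx->data_end;

		/* nonzero if the packet is shorter than limit bytes */
		return data + limit > data_end;
	}

	__attribute__((section("xdp"), used))
	int xdp_prog(struct xdp_md *ctx)
	{
		return small_pkt(ctx, 64) ? XDP_DROP : XDP_PASS;
	}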
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-17-ast@kernel.org --- kernel/bpf/btf.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 18 +++++++++-- 3 files changed, 98 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4b7c8bd423d6..4620267b186e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3985,6 +3985,88 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return 0; } +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_verifier_state *st = env->cur_state; + struct bpf_func_state *func = st->frame[st->curframe]; + struct bpf_reg_state *reg = func->regs; + struct bpf_verifier_log *log = &env->log; + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + const struct btf_param *args; + const struct btf_type *t; + u32 i, nargs, btf_id; + const char *tname; + + if (!prog->aux->func_info) + return 0; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return 0; + + if (prog->aux->func_info_aux[subprog].unreliable) + return 0; + + t = btf_type_by_id(btf, btf_id); + if (!t || !btf_type_is_func(t)) { + bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n", + subprog); + return -EINVAL; + } + tname = btf_name_by_offset(btf, t->name_off); + + t = btf_type_by_id(btf, t->type); + if (!t || !btf_type_is_func_proto(t)) { + bpf_log(log, "Invalid type of func %s\n", tname); + return -EINVAL; + } + args = (const struct btf_param *)(t + 1); + nargs = btf_type_vlen(t); + if (nargs > 5) { + bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + goto out; + } + /* check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < nargs; i++) { + t = btf_type_by_id(btf, args[i].type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (reg[i + 1].type == SCALAR_VALUE) + continue; + bpf_log(log, "R%d is not a scalar\n", i + 1); + goto out; + } + if (btf_type_is_ptr(t)) { + if (reg[i + 1].type == SCALAR_VALUE) { + bpf_log(log, "R%d is not a pointer\n", i + 1); + goto out; + } + /* If program is passing PTR_TO_CTX into subprogram + * check that BTF type matches. + */ + if (reg[i + 1].type == PTR_TO_CTX && + !btf_get_prog_ctx_type(log, btf, t, prog->type)) + goto out; + /* All other pointers are ok */ + continue; + } + bpf_log(log, "Unrecognized argument type %s\n", + btf_kind_str[BTF_INFO_KIND(t->info)]); + goto out; + } + return 0; +out: + /* LLVM optimizations can remove arguments from static functions. 
*/ + bpf_log(log, + "Type info disagrees with actual arguments due to compiler optimizations\n"); + prog->aux->func_info_aux[subprog].unreliable = true; + return 0; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 05a0ee75eca0..43ba647de720 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1328,6 +1328,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); kvfree(aux->func_info); + kfree(aux->func_info_aux); free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7395d6bebefd..11910149ca2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3970,6 +3970,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* only increment it after check_reg_arg() finished */ state->curframe++; + if (btf_check_func_arg_match(env, subprog)) + return -EINVAL; + /* and go analyze first insn of the callee */ *insn_idx = target_insn; @@ -6564,6 +6567,7 @@ static int check_btf_func(struct bpf_verifier_env *env, u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; @@ -6597,6 +6601,9 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); if (!krecord) return -ENOMEM; + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + if (!info_aux) + goto err_free; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -6648,29 +6655,31 @@ static int check_btf_func(struct bpf_verifier_env *env, ret = -EINVAL; goto err_free; } - prev_offset = krecord[i].insn_off; urecord += urec_size; } prog->aux->func_info = krecord; prog->aux->func_info_cnt = nfuncs; + prog->aux->func_info_aux = info_aux; return 0; err_free: kvfree(krecord); + kfree(info_aux); return ret; } static void adjust_btf_func(struct bpf_verifier_env *env) { + struct bpf_prog_aux *aux = env->prog->aux; int i; - if (!env->prog->aux->func_info) + if (!aux->func_info) return; for (i = 0; i < env->subprog_cnt; i++) - env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; + aux->func_info[i].insn_off = env->subprog_info[i].start; } #define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ @@ -7651,6 +7660,9 @@ static int do_check(struct bpf_verifier_env *env) 0 /* frameno */, 0 /* subprogno, zero == main subprog */); + if (btf_check_func_arg_match(env, 0)) + return -EINVAL; + for (;;) { struct bpf_insn *insn; u8 class; -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. 
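As a rough sketch of the intended usage (hypothetical source; SEC() comes from libbpf's bpf_helpers.h, and the target is selected at load time via expected_attach_type = BPF_TRACE_FEXIT, attach_prog_fd = fd of the already loaded XDP program, attach_btf_id = BTF id of its subprogram, as described below):

	#include <linux/types.h>
	#include <bpf/bpf_helpers.h>

	__u64 short_pkts;	/* global counter, readable from userspace */

	SEC("fexit/small_pkt")	/* small_pkt: hypothetical subprogram of the XDP prog */
	int trace_small_pkt(__u64 *ctx)
	{
		/* For FENTRY/FEXIT the context is an array of u64 holding the
		 * target's arguments; FEXIT additionally exposes the return
		 * value after them. The target has two args, so its return
		 * value sits in slot 2.
		 */
		int ret = ctx[2];

		if (ret)
			__sync_fetch_and_add(&short_pkts, 1);
		return 0;
	}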
The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- kernel/bpf/btf.c | 70 ++++++++++++++++++++++++++++++++++++++----- kernel/bpf/core.c | 2 ++ kernel/bpf/syscall.c | 19 +++++++++--- kernel/bpf/verifier.c | 83 +++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 147 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4620267b186e..40efde5eedcb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3530,6 +3530,20 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, return ctx_type; } +static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *t, + enum bpf_prog_type prog_type) +{ + const struct btf_member *prog_ctx_type, *kern_ctx_type; + + prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type); + if (!prog_ctx_type) + return -ENOENT; + kern_ctx_type = prog_ctx_type + 1; + return kern_ctx_type->type; +} + struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; @@ -3602,15 +3616,29 @@ errout: return ERR_PTR(err); } +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) +{ + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + + if (tgt_prog) { + return tgt_prog->aux->btf; + } else { + return btf_vmlinux; + } +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; + int ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3619,7 +3647,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - nr_args = btf_type_vlen(t); + /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ + nr_args = t ? btf_type_vlen(t) : 5; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -3628,18 +3657,24 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog->expected_attach_type == BPF_TRACE_FEXIT && arg == nr_args) { + if (!t) + /* Default prog with 5 args. 
6th arg is retval. */ + return true; /* function return type */ - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); } else if (arg >= nr_args) { bpf_log(log, "func '%s' doesn't have %d-th argument\n", tname, arg + 1); return false; } else { - t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!t) + /* Default prog with 5 args */ + return true; + t = btf_type_by_id(btf, args[arg].type); } /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t)) /* accessing a scalar */ return true; @@ -3647,7 +3682,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, - __btf_name_by_offset(btf_vmlinux, t->name_off), + __btf_name_by_offset(btf, t->name_off), btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } @@ -3662,10 +3697,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; info->btf_id = t->type; - t = btf_type_by_id(btf_vmlinux, t->type); + if (tgt_prog) { + ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type); + if (ret > 0) { + info->btf_id = ret; + return true; + } else { + return false; + } + } + t = btf_type_by_id(btf, t->type); /* skip modifiers */ while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", @@ -3674,7 +3718,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], - __btf_name_by_offset(btf_vmlinux, t->name_off)); + __btf_name_by_offset(btf, t->name_off)); return true; } @@ -3954,6 +3998,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, u32 i, nargs; int ret; + if (!func) { + /* BTF function prototype doesn't match the verifier types. + * Fall back to 5 u64 args. 
+ */ + for (i = 0; i < 5; i++) + m->arg_size[i] = 8; + m->ret_size = 8; + m->nr_args = 5; + return 0; + } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); if (nargs >= MAX_BPF_FUNC_ARGS) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da5a8b8e278f..b5945c3aaa8e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2031,6 +2031,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; + if (aux->linked_prog) + bpf_prog_put(aux->linked_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43ba647de720..c88c815c2154 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1577,7 +1577,7 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id) + u32 btf_id, u32 prog_fd) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: @@ -1585,7 +1585,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, return -EINVAL; break; default: - if (btf_id) + if (btf_id || prog_fd) return -EINVAL; break; } @@ -1636,7 +1636,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD attach_btf_id +#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -1679,7 +1679,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id)) + attr->attach_btf_id, + attr->attach_prog_fd)) return -EINVAL; /* plain bpf_prog allocation */ @@ -1689,6 +1690,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + if (attr->attach_prog_fd) { + struct bpf_prog *tgt_prog; + + tgt_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + goto free_prog_nouncharge; + } + prog->aux->linked_prog = tgt_prog; + } prog->aux->offload_requested = !!attr->prog_ifindex; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11910149ca2f..e9dc95a18d44 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9390,13 +9390,17 @@ static void print_verification_stats(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; + bool conservative = true; const char *tname; - int ret = 0; + struct btf *btf; long addr; + u64 key; if (prog->type != BPF_PROG_TYPE_TRACING) return 0; @@ -9405,19 +9409,47 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) verbose(env, "Tracing programs must provide btf_id\n"); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, btf_id); + btf = bpf_prog_get_target_btf(prog); + if (!btf) { + verbose(env, + "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); + return -EINVAL; + } + t = btf_type_by_id(btf, btf_id); if (!t) { verbose(env, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } - tname 
= btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); if (!tname) { verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } + if (tgt_prog) { + struct bpf_prog_aux *aux = tgt_prog->aux; + + for (i = 0; i < aux->func_info_cnt; i++) + if (aux->func_info[i].type_id == btf_id) { + subprog = i; + break; + } + if (subprog == -1) { + verbose(env, "Subprog %s doesn't exist\n", tname); + return -EINVAL; + } + conservative = aux->func_info_aux[subprog].unreliable; + key = ((u64)aux->id) << 32 | btf_id; + } else { + key = btf_id; + } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: + if (tgt_prog) { + verbose(env, + "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); + return -EINVAL; + } if (!btf_type_is_typedef(t)) { verbose(env, "attach_btf_id %u is not a typedef\n", btf_id); @@ -9429,11 +9461,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_ptr(t)) /* should never happen in valid vmlinux build */ return -EINVAL; - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ return -EINVAL; @@ -9452,30 +9484,51 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) btf_id); return -EINVAL; } - t = btf_type_by_id(btf_vmlinux, t->type); + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(btf_id); + tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; prog->aux->attach_func_name = tname; + /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); if (tr->func.addr) { prog->aux->trampoline = tr; goto out; } - ret = btf_distill_func_proto(&env->log, btf_vmlinux, t, + if (tgt_prog && conservative) { + prog->aux->attach_func_proto = NULL; + t = NULL; + } + ret = btf_distill_func_proto(&env->log, btf, t, tname, &tr->func.model); if (ret < 0) goto out; - addr = kallsyms_lookup_name(tname); - if (!addr) { - verbose(env, - "The address of function %s cannot be found\n", - tname); - ret = -ENOENT; - goto out; + if (tgt_prog) { + if (!tgt_prog->jited) { + /* for now */ + verbose(env, "Can trace only JITed BPF progs\n"); + ret = -EINVAL; + goto out; + } + if (tgt_prog->type == BPF_PROG_TYPE_TRACING) { + /* prevent cycles */ + verbose(env, "Cannot recursively attach\n"); + ret = -EINVAL; + goto out; + } + addr = (long) tgt_prog->aux->func[subprog]->bpf_func; + } else { + addr = kallsyms_lookup_name(tname); + if (!addr) { + verbose(env, + "The address of function %s cannot be found\n", + tname); + ret = -ENOENT; + goto out; + } } tr->func.addr = (void *)addr; prog->aux->trampoline = tr; -- cgit v1.2.3 From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. 
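For reference, restoring a task with a fixed PID through the extended interface looks roughly like the minimal userspace sketch below (the set_tid semantics and the CAP_SYS_ADMIN requirement are spelled out next; error handling omitted; assumes kernel headers that already carry the new clone_args fields):

	#define _GNU_SOURCE
	#include <linux/sched.h>	/* struct clone_args with set_tid/set_tid_size */
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <signal.h>
	#include <stdint.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t tids[] = { 31496 };		/* desired PID at the current ns level */
		struct clone_args args = {
			.exit_signal  = SIGCHLD,
			.set_tid      = (uintptr_t)tids,
			.set_tid_size = 1,
		};
		pid_t pid = syscall(__NR_clone3, &args, sizeof(args));

		if (pid == 0)
			_exit(0);			/* child sees getpid() == 31496 */
		if (pid > 0)
			waitpid(pid, NULL, 0);
		return pid > 0 ? 0 : 1;
	}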
Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 24 ++++++++++++++++- kernel/pid.c | 72 +++++++++++++++++++++++++++++++++++++++----------- kernel/pid_namespace.c | 2 -- 3 files changed, 80 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 954e875e72b1..417570263f1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, { int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; @@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (err) return err; + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; + + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; + + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; + /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal @@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size 
* sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } @@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) diff --git a/kernel/pid.c b/kernel/pid.c index 7b5f6c963d72..2278e249141d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -157,7 +157,8 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size) { struct pid *pid; enum pid_type type; @@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns) struct upid *upid; int retval = -ENOMEM; + /* + * set_tid_size contains the size of the set_tid array. Starting at + * the most nested currently active PID namespace it tells alloc_pid() + * which PID to set for a process in that most nested PID namespace + * up to set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but set_tid_size must + * never be greater than the current ns->level + 1. + */ + if (set_tid_size > ns->level + 1) + return ERR_PTR(-EINVAL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); @@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { - int pid_min = 1; + int tid = 0; + + if (set_tid_size) { + tid = set_tid[ns->level - i]; + + retval = -EINVAL; + if (tid < 1 || tid >= pid_max) + goto out_free; + /* + * Also fail if a PID != 1 is requested and + * no PID 1 exists. + */ + if (tid != 1 && !tmp->child_reaper) + goto out_free; + retval = -EPERM; + if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + goto out_free; + set_tid_size--; + } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - /* - * init really needs pid 1, but after reaching the maximum - * wrap back to RESERVED_PIDS - */ - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) - pid_min = RESERVED_PIDS; - - /* - * Store a null pointer so find_pid_ns does not find - * a partially initialized PID (see below). - */ - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + if (tid) { + nr = idr_alloc(&tmp->idr, NULL, tid, + tid + 1, GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. + */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else { + int pid_min = 1; + /* + * init really needs pid 1, but after reaching the + * maximum wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + } spin_unlock_irq(&pidmap_lock); idr_preload_end(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a6a79f85c81a..d40017e79ebe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. 
*/ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; -- cgit v1.2.3 From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into a potentially failing operation when the refcount reaches the BPF_MAX_REFCNT limit (32k). Due to the use of a 32-bit counter, it's possible in practice to overflow the refcounter and make it wrap around to 0, causing an erroneous map free while there are still references to it, leading to use-after-free problems. But a failing refcounting operation is problematic in some cases. One example is the mmap() interface. After establishing the initial memory mapping, the user is allowed to arbitrarily map/remap/unmap parts of the mapped memory, splitting it into multiple non-contiguous regions. All of this happens without any control from the users of the mmap subsystem. Rather, the mmap subsystem sends notifications to the original creator of the memory mapping through open/close callbacks, which are optionally specified when the initial memory mapping is created. These callbacks are used to maintain an accurate refcount for the bpf_map (see the next patch in this series). The problem is that the open() callback is not supposed to fail, because the memory-mapped resource is already set up and properly referenced. This poses a problem for using memory-mapping with BPF maps. One solution is to maintain a separate refcount just for memory mappings and do a single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in the current work on tcp-bpf, necessitating an extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to new use cases. Another approach is to use the non-failing refcount_t type, which uses a 32-bit counter internally but, once it reaches the saturation state at UINT_MAX, stays there. This ultimately causes a memory leak, but prevents use-after-free. But given that refcounting is not the most performance-critical operation for BPF maps (it's not used from running BPF program code), we can also just switch to a 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies the semantics and means the scenarios described above no longer have to worry about a failing refcount increment.
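The mmap case is the clearest illustration of why a non-failing increment matters: the vm_operations_struct ->open() callback returns void, so there is nowhere to report a refcount failure. A condensed sketch of the pattern this enables (not the exact kernel code; the real callbacks land in a later patch of this series):

	static void obj_vma_open(struct vm_area_struct *vma)
	{
		struct bpf_map *map = vma->vm_file->private_data;

		bpf_map_inc_with_uref(map);	/* void: cannot fail, which is the point */
	}

	static void obj_vma_close(struct vm_area_struct *vma)
	{
		struct bpf_map *map = vma->vm_file->private_data;

		bpf_map_put_with_uref(map);
	}

	static const struct vm_operations_struct obj_vmops = {
		.open	= obj_vma_open,
		.close	= obj_vma_close,
	};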
In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. 
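At call sites the cleanup boils down to the following shape (illustrative, not a literal hunk from this patch):

	/* before: taking a reference could fail and had to be checked */
	map = bpf_map_inc(map, false);
	if (IS_ERR(map))
		return PTR_ERR(map);

	/* after: cannot fail; use bpf_map_inc_with_uref() where the old
	 * code passed uref=true
	 */
	bpf_map_inc(map);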
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- kernel/bpf/inode.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/syscall.c | 51 +++++++++++++++++++++---------------------------- kernel/bpf/verifier.c | 6 +----- kernel/bpf/xskmap.c | 6 ++---- 5 files changed, 27 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index a70f7209cda3..2f17f24258dc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - raw = bpf_map_inc(raw, true); + bpf_map_inc_with_uref(raw); break; default: WARN_ON_ONCE(1); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index fab4fb134547..4cbe987be35b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -98,7 +98,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) - inner_map = bpf_map_inc(inner_map, false); + bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c88c815c2154..20030751b7a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -311,7 +311,7 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { - if (atomic_dec_and_test(&map->usercnt)) { + if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } @@ -322,7 +322,7 @@ static void bpf_map_put_uref(struct bpf_map *map) */ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { - if (atomic_dec_and_test(&map->refcnt)) { + if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); @@ -575,8 +575,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - atomic_set(&map->refcnt, 1); - atomic_set(&map->usercnt, 1); + atomic64_set(&map->refcnt, 1); + atomic64_set(&map->usercnt, 1); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -653,21 +653,19 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -/* prog's and map's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) +void bpf_map_inc(struct bpf_map *map) { - if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&map->refcnt); - return ERR_PTR(-EBUSY); - } - if (uref) - atomic_inc(&map->usercnt); - return map; + atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); +void bpf_map_inc_with_uref(struct bpf_map *map) +{ + atomic64_inc(&map->refcnt); + atomic64_inc(&map->usercnt); +} +EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -677,38 +675,30 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - map = bpf_map_inc(map, true); + bpf_map_inc_with_uref(map); fdput(f); return map; } /* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; - refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_map_put(map, false); - return ERR_PTR(-EBUSY); - } - + 
refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); - if (uref) - atomic_inc(&map->usercnt); + atomic64_inc(&map->usercnt); return map; } -struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, uref); + map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; @@ -1455,6 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } +/* prog's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9dc95a18d44..9f59f7a19dd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8179,11 +8179,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_used_maps() */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) { - fdput(f); - return PTR_ERR(map); - } + bpf_map_inc(map); aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index da16c30868f3..90c4fce1c981 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -11,10 +11,8 @@ int xsk_map_inc(struct xsk_map *map) { - struct bpf_map *m = &map->map; - - m = bpf_map_inc(m, false); - return PTR_ERR_OR_ZERO(m); + bpf_map_inc(&map->map); + return 0; } void xsk_map_put(struct xsk_map *map) -- cgit v1.2.3 From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. 
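After the conversion only the _not_zero variant keeps a conditional path, and it now only reports that the program is already being freed; schematically (illustrative, not a hunk from this patch):

	/* caller already holds a reference: plain increment, cannot fail */
	bpf_prog_inc(prog);

	/* lookup from the ID table: the program may already be on its way
	 * out, so a 0 -> 1 transition is refused with -ENOENT
	 */
	prog = bpf_prog_inc_not_zero(prog);
	if (IS_ERR(prog))
		return PTR_ERR(prog);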
Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- kernel/bpf/inode.c | 5 +++-- kernel/bpf/syscall.c | 30 +++++++++--------------------- kernel/events/core.c | 7 ++----- 3 files changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2f17f24258dc..ecf42bec38c0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,7 +31,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - raw = bpf_prog_inc(raw); + bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); @@ -534,7 +534,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); - return bpf_prog_inc(prog); + bpf_prog_inc(prog); + return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 20030751b7a2..52fe4bacb330 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1339,7 +1339,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic_dec_and_test(&prog->aux->refcnt)) { + if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1445,16 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -/* prog's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) +void bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_sub(i, &prog->aux->refcnt); - return ERR_PTR(-EBUSY); - } - return prog; + atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); @@ -1465,13 +1458,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i) * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ - WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); + WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +void bpf_prog_inc(struct bpf_prog *prog) { - return bpf_prog_add(prog, 1); + atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); @@ -1480,12 +1473,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; - refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_prog_put(prog, false); - return ERR_PTR(-EBUSY); - } + refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); @@ -1523,7 +1511,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, goto out; } - prog = bpf_prog_inc(prog); + bpf_prog_inc(prog); out: fdput(f); return prog; @@ -1714,7 +1702,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->orig_prog = NULL; prog->jited = 0; - atomic_set(&prog->aux->refcnt, 1); + atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..73c616876597 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10477,12 +10477,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; -- cgit v1.2.3 From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. 
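From userspace the new flag is used roughly as in the sketch below (raw bpf(2) syscall for brevity; BPF_F_MMAPABLE is the uapi flag added together with this patch; error handling omitted):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		union bpf_attr attr;
		unsigned long long *vals;
		long page = sysconf(_SC_PAGESIZE);	/* value area is padded to page size */
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.map_type    = BPF_MAP_TYPE_ARRAY;
		attr.key_size    = 4;
		attr.value_size  = 8;
		attr.max_entries = 64;
		attr.map_flags   = BPF_F_MMAPABLE;
		fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));

		/* MAP_SHARED is required; writable mappings also block map freezing */
		vals = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		vals[0] = 42;		/* no bpf_map_update_elem() round trip needed */

		munmap(vals, page);
		close(fd);
		return 0;
	}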
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- kernel/bpf/arraymap.c | 58 ++++++++++++++++++++++++++---- kernel/bpf/syscall.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1c65ce0098a9..a42097c36b0c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr) (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_flags & BPF_F_MMAPABLE) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } array_size = sizeof(*array); - if (percpu) + if (percpu) { array_size += (u64) max_entries * sizeof(void *); - else - array_size += (u64) max_entries * elem_size; + } else { + /* rely on vmalloc() to return page-aligned memory and + * ensure array->value is exactly page-aligned + */ + if (attr->map_flags & BPF_F_MMAPABLE) { + array_size = PAGE_ALIGN(array_size); + array_size += PAGE_ALIGN((u64) max_entries * elem_size); + } else { + array_size += (u64) max_entries * elem_size; + } + } /* make sure there is no u32 overflow later in round_up() */ cost = array_size; @@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + if (attr->map_flags & BPF_F_MMAPABLE) { + void *data; + + /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ + data = bpf_map_area_mmapable_alloc(array_size, numa_node); + if (!data) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + array = data + PAGE_ALIGN(sizeof(struct bpf_array)) + - offsetof(struct bpf_array, value); + } else { + array = bpf_map_area_alloc(array_size, numa_node); + } if (!array) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } +static void *array_map_vmalloc_addr(struct bpf_array *array) +{ + return (void *)round_down((unsigned long)array, PAGE_SIZE); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map) if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); - bpf_map_area_free(array); + if (array->map.map_flags & BPF_F_MMAPABLE) + bpf_map_area_free(array_map_vmalloc_addr(array)); + else + bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } +int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> 
PAGE_SHIFT; + + if (!(map->map_flags & BPF_F_MMAPABLE)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = { .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, + .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 52fe4bacb330..bac3becf9f90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + /* kmalloc()'ed memory can't be mmap()'ed */ + if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); if (area != NULL) return area; } - + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | + __GFP_RETRY_MAYFAIL | flags); + } return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, __builtin_return_address(0)); } +void *bpf_map_area_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, false); +} + +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +{ + return __bpf_map_area_alloc(size, numa_node, true); +} + void bpf_map_area_free(void *area) { kvfree(area); @@ -425,6 +440,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } +/* called for any extra memory-mapped regions (except initial) */ +static void bpf_map_mmap_open(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt++; + mutex_unlock(&map->freeze_mutex); + } +} + +/* called for all unmapped memory region (including initial) */ +static void bpf_map_mmap_close(struct vm_area_struct *vma) +{ + struct bpf_map *map = vma->vm_file->private_data; + + if (vma->vm_flags & VM_WRITE) { + mutex_lock(&map->freeze_mutex); + map->writecnt--; + mutex_unlock(&map->freeze_mutex); + } + + bpf_map_put_with_uref(map); +} + +static const struct vm_operations_struct bpf_map_default_vmops = { + .open = bpf_map_mmap_open, + .close = bpf_map_mmap_close, +}; + +static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct bpf_map *map = filp->private_data; + int err; + + if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + return -ENOTSUPP; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + mutex_lock(&map->freeze_mutex); + + if ((vma->vm_flags & VM_WRITE) && map->frozen) { + err = -EPERM; + goto out; + } + + /* set default open/close callbacks */ + vma->vm_ops = &bpf_map_default_vmops; + vma->vm_private_data = map; + + err = 
map->ops->map_mmap(map, vma); + if (err) + goto out; + + bpf_map_inc_with_uref(map); + + if (vma->vm_flags & VM_WRITE) + map->writecnt++; +out: + mutex_unlock(&map->freeze_mutex); + return err; +} + const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, @@ -432,6 +515,7 @@ const struct file_operations bpf_map_fops = { .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, + .mmap = bpf_map_mmap, }; int bpf_map_new_fd(struct bpf_map *map, int flags) @@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); + mutex_init(&map->freeze_mutex); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + + mutex_lock(&map->freeze_mutex); + + if (map->writecnt) { + err = -EBUSY; + goto err_put; + } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; @@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: + mutex_unlock(&map->freeze_mutex); fdput(f); return err; } -- cgit v1.2.3 From b2e2f0e6a6f910c906c083584b6e0afd12266f22 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Nov 2019 22:21:13 +0800 Subject: bpf: Make array_map_mmap static Fix sparse warning: kernel/bpf/arraymap.c:481:5: warning: symbol 'array_map_mmap' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191119142113.15388-1-yuehaibing@huawei.com --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index a42097c36b0c..633c8c701ff6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -478,7 +478,7 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } -int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; -- cgit v1.2.3 From 05ff1ba412fd6bd48d56dd4c0baff626533728cc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Nov 2019 10:33:34 +0100 Subject: PM: QoS: Invalidate frequency QoS requests after removal Switching cpufreq drivers (or switching operation modes of the intel_pstate driver from "active" to "passive" and vice versa) does not work on some x86 systems with ACPI after commit 3000ce3c52f8 ("cpufreq: Use per-policy frequency QoS"), because the ACPI _PPC and thermal code uses the same frequency QoS request object for a given CPU every time a cpufreq driver is registered and freq_qos_remove_request() does not invalidate the request after removing it from its QoS list, so freq_qos_add_request() complains and fails when that request is passed to it again. Fix the issue by modifying freq_qos_remove_request() to clear the qos and type fields of the frequency request pointed to by its argument after removing it from its QoS list so as to invalidate it. Fixes: 3000ce3c52f8 ("cpufreq: Use per-policy frequency QoS") Reported-and-tested-by: Doug Smythies Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- kernel/power/qos.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 04e83fdfbe80..a45cba7df0ae 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -814,6 +814,8 @@ EXPORT_SYMBOL_GPL(freq_qos_update_request); */ int freq_qos_remove_request(struct freq_qos_request *req) { + int ret; + if (!req) return -EINVAL; @@ -821,7 +823,11 @@ int freq_qos_remove_request(struct freq_qos_request *req) "%s() called for unknown object\n", __func__)) return -EINVAL; - return freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + req->qos = NULL; + req->type = 0; + + return ret; } EXPORT_SYMBOL_GPL(freq_qos_remove_request); -- cgit v1.2.3 From 9e77716a75bc6cf54965e5ec069ba7c02b32251c Mon Sep 17 00:00:00 2001 From: Luc Van Oostenryck Date: Wed, 20 Nov 2019 01:33:20 +0100 Subject: fork: fix pidfd_poll()'s return type pidfd_poll() is defined as returning 'unsigned int' but the .poll method is declared as returning '__poll_t', a bitwise type. Fix this by using the proper return type and using the EPOLL constants instead of the POLL ones, as required for __poll_t. Fixes: b53b0b9d9a61 ("pidfd: add polling support") Cc: Joel Fernandes (Google) Cc: stable@vger.kernel.org # 5.3 Signed-off-by: Luc Van Oostenryck Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191120003320.31138-1-luc.vanoostenryck@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 55af6931c6ec..13b38794efb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1708,11 +1708,11 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) /* * Poll support for process exit notification. */ -static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; - int poll_flags = 0; + __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); @@ -1724,7 +1724,7 @@ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) - poll_flags = POLLIN | POLLRDNORM; + poll_flags = EPOLLIN | EPOLLRDNORM; rcu_read_unlock(); return poll_flags; -- cgit v1.2.3 From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] 
---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- kernel/auditsc.c | 2 +- kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4effe01ebbe2..9bf1045fedfa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } -static void audit_log_task(struct audit_buffer *ab) +void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bac3becf9f90..17f4254495f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -1318,6 +1319,34 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +enum bpf_event { + BPF_EVENT_LOAD, + BPF_EVENT_UNLOAD, +}; + +static const char * const bpf_event_audit_str[] = { + [BPF_EVENT_LOAD] = "LOAD", + [BPF_EVENT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event) +{ + bool has_task_context = event == BPF_EVENT_LOAD; + struct audit_buffer *ab; + + if (audit_enabled == AUDIT_OFF) + return; + ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + if (has_task_context) + audit_log_task(ab); + audit_log_format(ab, "%sprog-id=%u event=%s", + has_task_context ? 
" " : "", + prog->aux->id, bpf_event_audit_str[event]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1434,6 +1463,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_EVENT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1843,6 +1873,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_EVENT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit v1.2.3 From 196e8ca74886c433dcfc64a809707074b936aaf5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 23:04:44 +0100 Subject: bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size Given we recently extended the original bpf_map_area_alloc() helper in commit fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY"), we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts, extend it for bpf-next. Reported-by: Stephen Rothwell Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 17f4254495f2..b51ecb9644d0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) +static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -143,6 +143,9 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; + if (size >= SIZE_MAX) + return NULL; + /* kmalloc()'ed memory can't be mmap()'ed */ if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, @@ -160,12 +163,12 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable) flags, __builtin_return_address(0)); } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node) +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } @@ -214,7 +217,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) { u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; struct user_struct *user; -- cgit v1.2.3 From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit 
messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. Signed-off-by: Jakub Kicinski --- kernel/auditsc.c | 2 +- kernel/bpf/syscall.c | 31 ------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9bf1045fedfa..4effe01ebbe2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad) audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); } -void audit_log_task(struct audit_buffer *ab) +static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b51ecb9644d0..4ae52eb05f41 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -1322,34 +1321,6 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } -enum bpf_event { - BPF_EVENT_LOAD, - BPF_EVENT_UNLOAD, -}; - -static const char * const bpf_event_audit_str[] = { - [BPF_EVENT_LOAD] = "LOAD", - [BPF_EVENT_UNLOAD] = "UNLOAD", -}; - -static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event) -{ - bool has_task_context = event == BPF_EVENT_LOAD; - struct audit_buffer *ab; - - if (audit_enabled == AUDIT_OFF) - return; - ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF); - if (unlikely(!ab)) - return; - if (has_task_context) - audit_log_task(ab); - audit_log_format(ab, "%sprog-id=%u event=%s", - has_task_context ? " " : "", - prog->aux->id, bpf_event_audit_str[event]); - audit_log_end(ab); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1466,7 +1437,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); - bpf_audit_prog(prog, BPF_EVENT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1876,7 +1846,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); - bpf_audit_prog(prog, BPF_EVENT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) -- cgit v1.2.3 From 071cdecec57fb5d5df78e6a12114ad7bccea5b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 21 Nov 2019 14:36:12 +0100 Subject: xdp: Fix cleanup on map free for devmap_hash map type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tetsuo pointed out that it was not only the device unregister hook that was broken for devmap_hash types, it was also cleanup on map free. So better fix this as well. While we're at it, there's no reason to allocate the netdev_map array for DEVMAP_HASH, so skip that and adjust the cost accordingly. 
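To make the layout split behind this fix concrete, a simplified sketch of the per-flavour teardown (free_entry() and free_bucket() are placeholders and struct bpf_dtab is abridged here; the real loops are in the diff below):

static void dtab_free_entries(struct bpf_dtab *dtab)
{
        int i;

        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                /* DEVMAP_HASH: entries hang off hlist buckets */
                for (i = 0; i < dtab->n_buckets; i++)
                        free_bucket(dev_map_index_hash(dtab, i));
                kfree(dtab->dev_index_head);
        } else {
                /* plain DEVMAP: entries live in a flat pointer array */
                for (i = 0; i < dtab->map.max_entries; i++)
                        free_entry(dtab->netdev_map[i]);
                bpf_map_area_free(dtab->netdev_map);
        }
}

The original free path only walked the flat array, which missed DEVMAP_HASH entries on map free while also allocating netdev_map for a flavour that never uses it.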
Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index") Reported-by: Tetsuo Handa Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20191121133612.430414-1-toke@redhat.com --- kernel/bpf/devmap.c | 74 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3867864cdc2f..3d3d61b5985b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -74,7 +74,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; - struct bpf_dtab_netdev **netdev_map; + struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head __percpu *flush_list; struct list_head list; @@ -101,6 +101,12 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries) return hash; } +static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, + int idx) +{ + return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +} + static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { int err, cpu; @@ -120,8 +126,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ - cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += sizeof(struct list_head) * num_possible_cpus(); + cost = (u64) sizeof(struct list_head) * num_possible_cpus(); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -129,6 +134,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; + } else { + cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } /* if map size is larger than memlock limit, reject it */ @@ -143,24 +150,22 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *), - dtab->map.numa_node); - if (!dtab->netdev_map) - goto free_percpu; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_map_area; + goto free_percpu; spin_lock_init(&dtab->index_lock); + } else { + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); + if (!dtab->netdev_map) + goto free_percpu; } return 0; -free_map_area: - bpf_map_area_free(dtab->netdev_map); free_percpu: free_percpu(dtab->flush_list); free_charge: @@ -228,21 +233,40 @@ static void dev_map_free(struct bpf_map *map) cond_resched(); } - for (i = 0; i < dtab->map.max_entries; i++) { - struct bpf_dtab_netdev *dev; + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; - dev = dtab->netdev_map[i]; - if (!dev) - continue; + head = dev_map_index_hash(dtab, i); - free_percpu(dev->bulkq); - dev_put(dev->dev); - kfree(dev); + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + hlist_del_rcu(&dev->index_hlist); + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + 
} + + kfree(dtab->dev_index_head); + } else { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + free_percpu(dev->bulkq); + dev_put(dev->dev); + kfree(dev); + } + + bpf_map_area_free(dtab->netdev_map); } free_percpu(dtab->flush_list); - bpf_map_area_free(dtab->netdev_map); - kfree(dtab->dev_index_head); kfree(dtab); } @@ -263,12 +287,6 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, - int idx) -{ - return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; -} - struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); -- cgit v1.2.3 From 581738a681b6faae5725c2555439189ca81c0f1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Nov 2019 09:06:50 -0800 Subject: bpf: Provide better register bounds after jmp32 instructions With latest llvm (trunk https://github.com/llvm/llvm-project), test_progs, which has +alu32 enabled, failed for strobemeta.o. The verifier output looks like below with edit to replace large decimal numbers with hex ones. 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,umax_value=0xffffffff00000001) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 197: (67) r6 <<= 32 R6_w=inv(id=0,smax_value=0x7fffffff00000000,umax_value=0xffffffff00000000, var_off=(0x0; 0xffffffff00000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=0xffffffff,var_off=(0x0; 0xffffffff)) ... 257: (85) call bpf_probe_read_user_str#114 R1 unbounded memory access, make sure to bounds check any array access into a map The value range for register r6 at insn 198 should be really just 0/1. The umax_value=0xffffffff caused later verification failure. After jmp instructions, the current verifier already tried to use just obtained information to get better register range. The current mechanism is for 64bit register only. This patch implemented to tighten the range for 32bit sub-registers after jmp32 instructions. With the patch, we have the below range ranges for the above code sequence: 193: (85) call bpf_probe_read_user_str#114 R0=inv(id=0) 194: (26) if w0 > 0x1 goto pc+4 R0_w=inv(id=0,smax_value=0x7fffffff00000001,umax_value=0xffffffff00000001, var_off=(0x0; 0xffffffff00000001)) 195: (6b) *(u16 *)(r7 +80) = r0 196: (bc) w6 = w0 R6_w=inv(id=0,umax_value=0xffffffff,var_off=(0x0; 0x1)) 197: (67) r6 <<= 32 R6_w=inv(id=0,umax_value=0x100000000,var_off=(0x0; 0x100000000)) 198: (77) r6 >>= 32 R6=inv(id=0,umax_value=1,var_off=(0x0; 0x1)) ... 201: (79) r8 = *(u64 *)(r10 -416) R8_w=map_value(id=0,off=40,ks=4,vs=13872,imm=0) 202: (0f) r8 += r6 R8_w=map_value(id=0,off=40,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) 203: (07) r8 += 9696 R8_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 
255: (bf) r1 = r8 R1_w=map_value(id=0,off=9736,ks=4,vs=13872,umax_value=1,var_off=(0x0; 0x1)) ... 257: (85) call bpf_probe_read_user_str#114 ... At insn 194, the register R0 has better var_off.mask and smax_value. Especially, the var_off.mask ensures later lshift and rshift maintains proper value range. Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191121170650.449030-1-yhs@fb.com --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f59f7a19dd0..fc85714428c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1007,6 +1007,17 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->umax_value)); } +static void __reg_bound_offset32(struct bpf_reg_state *reg) +{ + u64 mask = 0xffffFFFF; + struct tnum range = tnum_range(reg->umin_value & mask, + reg->umax_value & mask); + struct tnum lo32 = tnum_cast(reg->var_off, 4); + struct tnum hi32 = tnum_lshift(tnum_rshift(reg->var_off, 32), 32); + + reg->var_off = tnum_or(hi32, tnum_intersect(lo32, range)); +} + /* Reset the min/max bounds of a register */ static void __mark_reg_unbounded(struct bpf_reg_state *reg) { @@ -5589,6 +5600,10 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. @@ -5698,6 +5713,10 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* We might have learned some bits from the bounds. */ __reg_bound_offset(false_reg); __reg_bound_offset(true_reg); + if (is_jmp32) { + __reg_bound_offset32(false_reg); + __reg_bound_offset32(true_reg); + } /* Intersecting with the old var_off might have improved our bounds * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), * then new var_off is (0; 0x7f...fc) which improves our umax. -- cgit v1.2.3 From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. 
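The underlying pattern is the usual RCU-callback-to-workqueue hand-off; a generic sketch for reference (struct obj and its fields are illustrative, not kernel data structures):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
        struct rcu_head rcu;
        struct work_struct work;
        struct mutex lock;
        /* ... resources that need a sleepable context to release ... */
};

static void obj_free_deferred(struct work_struct *work)
{
        struct obj *o = container_of(work, struct obj, work);

        mutex_lock(&o->lock);   /* sleeping is allowed here */
        /* ... untrack / release resources ... */
        mutex_unlock(&o->lock);
        kfree(o);
}

static void obj_free_rcu(struct rcu_head *rcu)
{
        struct obj *o = container_of(rcu, struct obj, rcu);

        /* RCU callbacks run in softirq context and must not sleep,
         * so only schedule the sleepable part from here.
         */
        INIT_WORK(&o->work, obj_free_deferred);
        schedule_work(&o->work);
}

bpf_prog_free_deferred() already runs from such a work item, which is why moving bpf_free_used_maps() there gives the upcoming poke untracking the sleepable context it needs.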
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- kernel/bpf/core.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 20 -------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b5945c3aaa8e..0e825c164f1a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2003,12 +2003,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } +static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) +{ + enum bpf_cgroup_storage_type stype; + + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + bpf_free_cgroup_storage(aux); + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + kfree(aux->used_maps); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); + bpf_free_used_maps(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ae52eb05f41..373778da8489 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1302,25 +1302,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } -/* drop refcnt on maps used by eBPF program and free auxilary data */ -static void free_used_maps(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); - } - - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); - - kfree(aux->used_maps); -} - int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1415,7 +1396,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); -- cgit v1.2.3 From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. 
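In shape, the change looks roughly like this (abridged sketch of the header side, which is outside the kernel/ diff shown below; only the moved members appear):

/* the two owner fields leave the hot struct and move into a small aux
 * object that only prog arrays allocate
 */
struct bpf_array_aux {
        enum bpf_prog_type type;        /* formerly bpf_array::owner_prog_type */
        bool jited;                     /* formerly bpf_array::owner_jited */
};

struct bpf_array {
        struct bpf_map map;
        u32 elem_size;
        struct bpf_array_aux *aux;      /* set up by prog_array_map_alloc() */
        /* ... value / ptrs storage follows ... */
};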
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- kernel/bpf/arraymap.c | 32 ++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 11 +++++------ kernel/bpf/map_in_map.c | 5 ++--- kernel/bpf/syscall.c | 16 ++++++---------- 4 files changed, 43 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633c8c701ff6..57da950ee55b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -671,10 +671,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array_aux *aux; + struct bpf_map *map; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return ERR_PTR(-ENOMEM); + + map = array_map_alloc(attr); + if (IS_ERR(map)) { + kfree(aux); + return map; + } + + container_of(map, struct bpf_array, map)->aux = aux; + return map; +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + kfree(aux); + fd_array_map_free(map); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, - .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0e825c164f1a..07af9c1d9cf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1691,18 +1691,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array, if (fp->kprobe_override) return false; - if (!array->owner_prog_type) { + if (!array->aux->type) { /* There's no owner yet where we could check for * compatibility. */ - array->owner_prog_type = fp->type; - array->owner_jited = fp->jited; - + array->aux->type = fp->type; + array->aux->jited = fp->jited; return true; } - return array->owner_prog_type == fp->type && - array->owner_jited == fp->jited; + return array->aux->type == fp->type && + array->aux->jited == fp->jited; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 4cbe987be35b..5e9366b33f0f 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->owner_prog_type and owner_jited - * is a runtime binding. Doing static check alone - * in the verifier is not enough. + /* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. 
*/ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 373778da8489..b904d56ec686 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -389,13 +389,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; const struct bpf_array *array; - u32 owner_prog_type = 0; - u32 owner_jited = 0; + u32 type = 0, jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - owner_prog_type = array->owner_prog_type; - owner_jited = array->owner_jited; + type = array->aux->type; + jited = array->aux->jited; } seq_printf(m, @@ -415,12 +414,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->memory.pages * 1ULL << PAGE_SHIFT, map->id, READ_ONCE(map->frozen)); - - if (owner_prog_type) { - seq_printf(m, "owner_prog_type:\t%u\n", - owner_prog_type); - seq_printf(m, "owner_jited:\t%u\n", - owner_jited); + if (type) { + seq_printf(m, "owner_prog_type:\t%u\n", type); + seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif -- cgit v1.2.3 From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- kernel/bpf/core.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 07af9c1d9cf1..608b7085e0c9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { free_percpu(fp->aux->stats); + kfree(fp->aux->poke_tab); kfree(fp->aux); } vfree(fp); @@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; } +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + static const u32 poke_tab_max = 1024; + u32 slot = prog->aux->size_poke_tab; + u32 size = slot + 1; + + if (size > poke_tab_max) + return -ENOSPC; + if (poke->ip || poke->ip_stable || poke->adj_off) + return -EINVAL; + + switch (poke->reason) { + case BPF_POKE_REASON_TAIL_CALL: + if (!poke->tail_call.map) + return -EINVAL; + break; + default: + return -EINVAL; + } + + tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL); + if (!tab) + return -ENOMEM; + + memcpy(&tab[slot], poke, sizeof(*poke)); + prog->aux->size_poke_tab = size; + prog->aux->poke_tab = tab; + + return slot; +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, -- cgit v1.2.3 From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. 
This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- kernel/bpf/arraymap.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/core.c | 9 ++- kernel/bpf/syscall.c | 20 ++++-- 3 files changed, 200 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 57da950ee55b..58bdf5fd24cc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); - old_ptr = xchg(array->ptrs + index, new_ptr); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, new_ptr); + map->ops->map_poke_run(map, index, old_ptr, new_ptr); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, new_ptr); + } + if (old_ptr) map->ops->map_fd_put_ptr(old_ptr); - return 0; } @@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return -E2BIG; - old_ptr = xchg(array->ptrs + index, NULL); + if (map->ops->map_poke_run) { + mutex_lock(&array->aux->poke_mutex); + old_ptr = xchg(array->ptrs + index, NULL); + map->ops->map_poke_run(map, index, old_ptr, NULL); + mutex_unlock(&array->aux->poke_mutex); + } else { + old_ptr = xchg(array->ptrs + index, NULL); + } + if (old_ptr) { map->ops->map_fd_put_ptr(old_ptr); return 0; @@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +struct prog_poke_elem { + struct list_head list; + struct bpf_prog_aux *aux; +}; + +static int prog_array_map_poke_track(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + int ret = 0; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry(elem, &aux->poke_progs, list) { + if (elem->aux == prog_aux) + goto out; + } + + elem = kmalloc(sizeof(*elem), GFP_KERNEL); + if (!elem) { + ret = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&elem->list); + /* We must track the program's aux info at this point in time + * since the program pointer itself may not be stable yet, see + * also comment in prog_array_map_poke_run(). 
+ */ + elem->aux = prog_aux; + + list_add_tail(&elem->list, &aux->poke_progs); +out: + mutex_unlock(&aux->poke_mutex); + return ret; +} + +static void prog_array_map_poke_untrack(struct bpf_map *map, + struct bpf_prog_aux *prog_aux) +{ + struct prog_poke_elem *elem, *tmp; + struct bpf_array_aux *aux; + + aux = container_of(map, struct bpf_array, map)->aux; + mutex_lock(&aux->poke_mutex); + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + if (elem->aux == prog_aux) { + list_del_init(&elem->list); + kfree(elem); + break; + } + } + mutex_unlock(&aux->poke_mutex); +} + +static void prog_array_map_poke_run(struct bpf_map *map, u32 key, + struct bpf_prog *old, + struct bpf_prog *new) +{ + enum bpf_text_poke_type type; + struct prog_poke_elem *elem; + struct bpf_array_aux *aux; + + if (!old && new) + type = BPF_MOD_NOP_TO_JUMP; + else if (old && !new) + type = BPF_MOD_JUMP_TO_NOP; + else if (old && new) + type = BPF_MOD_JUMP_TO_JUMP; + else + return; + + aux = container_of(map, struct bpf_array, map)->aux; + WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); + + list_for_each_entry(elem, &aux->poke_progs, list) { + struct bpf_jit_poke_descriptor *poke; + int i, ret; + + for (i = 0; i < elem->aux->size_poke_tab; i++) { + poke = &elem->aux->poke_tab[i]; + + /* Few things to be aware of: + * + * 1) We can only ever access aux in this context, but + * not aux->prog since it might not be stable yet and + * there could be danger of use after free otherwise. + * 2) Initially when we start tracking aux, the program + * is not JITed yet and also does not have a kallsyms + * entry. We skip these as poke->ip_stable is not + * active yet. The JIT will do the final fixup before + * setting it stable. The various poke->ip_stable are + * successively activated, so tail call updates can + * arrive from here while JIT is still finishing its + * final fixup for non-activated poke entries. + * 3) On program teardown, the program's kallsym entry gets + * removed out of RCU callback, but we can only untrack + * from sleepable context, therefore bpf_arch_text_poke() + * might not see that this is in BPF text section and + * bails out with -EINVAL. As these are unreachable since + * RCU grace period already passed, we simply skip them. + * 4) Also programs reaching refcount of zero while patching + * is in progress is okay since we're protected under + * poke_mutex and untrack the programs before the JIT + * buffer is freed. When we're still in the middle of + * patching and suddenly kallsyms entry of the program + * gets evicted, we just skip the rest which is fine due + * to point 3). + * 5) Any other error happening below from bpf_arch_text_poke() + * is a unexpected bug. + */ + if (!READ_ONCE(poke->ip_stable)) + continue; + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + if (poke->tail_call.map != map || + poke->tail_call.key != key) + continue; + + ret = bpf_arch_text_poke(poke->ip, type, + old ? (u8 *)old->bpf_func + + poke->adj_off : NULL, + new ? 
(u8 *)new->bpf_func + + poke->adj_off : NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } +} + +static void prog_array_map_clear_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_array_aux, + work)->map; + bpf_fd_array_map_clear(map); + bpf_map_put(map); +} + +static void prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array_aux *aux = container_of(map, struct bpf_array, + map)->aux; + bpf_map_inc(map); + schedule_work(&aux->work); +} + static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; @@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) if (!aux) return ERR_PTR(-ENOMEM); + INIT_WORK(&aux->work, prog_array_map_clear_deferred); + INIT_LIST_HEAD(&aux->poke_progs); + mutex_init(&aux->poke_mutex); + map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); @@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) } container_of(map, struct bpf_array, map)->aux = aux; + aux->map = map; + return map; } static void prog_array_map_free(struct bpf_map *map) { + struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; + list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { + list_del_init(&elem->list); + kfree(elem); + } kfree(aux); fd_array_map_free(map); } @@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, + .map_poke_track = prog_array_map_poke_track, + .map_poke_untrack = prog_array_map_poke_untrack, + .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, - .map_release_uref = bpf_fd_array_map_clear, + .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 608b7085e0c9..49e32acad7d8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) static void bpf_free_used_maps(struct bpf_prog_aux *aux) { + struct bpf_map *map; int i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) - bpf_map_put(aux->used_maps[i]); + for (i = 0; i < aux->used_map_cnt; i++) { + map = aux->used_maps[i]; + if (map->ops->map_poke_untrack) + map->ops->map_poke_untrack(map, aux); + bpf_map_put(map); + } kfree(aux->used_maps); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b904d56ec686..e3461ec59570 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,12 +25,13 @@ #include #include -#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ - (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY) #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) -#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || 
IS_FD_HASH(map)) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \ + IS_FD_HASH(map)) #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) @@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (IS_FD_ARRAY(map)) { + } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); @@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + goto out; } /* must increment bpf_prog_active to avoid kprobe+bpf triggering from @@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; + } else if (IS_FD_PROG_ARRAY(map)) { + err = map->ops->map_delete_elem(map, key); + goto out; } preempt_disable(); -- cgit v1.2.3 From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:59 +0100 Subject: bpf: Constant map key tracking for prog array pokes Add tracking of constant keys into tail call maps. The signature of bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3 is a index key. The direct call approach for tail calls can be enabled if the verifier asserted that for all branches leading to the tail call helper invocation, the map pointer and index key were both constant and the same. Tracking of map pointers we already do from prior work via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls on maps"). Given the tail call map index key is not on stack but directly in the register, we can add similar tracking approach and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab with the relevant information for the JITing phase. We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order to point into the prog's poke_tab, and keep insn->imm as 0 as indicator that current indirect tail call emission must be used. Note that publishing to the tracker must happen at the end of fixup_bpf_calls() since adding elements to the poke_tab reallocates its memory, so we need to wait until its in final state. Future work can generalize and add similar approach to optimize plain array map lookups. Difference there is that we need to look into the key value that sits on stack. For clarity in bpf_insn_aux_data, map_state has been renamed into map_ptr_state, so we get map_{ptr,key}_state as trackers. 
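As a consumer-side illustration (not part of the patch; it uses libbpf's bpf_helpers.h BTF-style map definition macros), a tail call whose index is a compile-time constant is exactly what this tracking is meant to catch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 4);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

SEC("xdp")
int dispatch(struct xdp_md *ctx)
{
        /* constant key: the verifier records the (map, key) pair so the
         * JIT can later patch this into a direct jump
         */
        bpf_tail_call(ctx, &jmp_table, 1);

        /* a run-time-variable index would instead poison the key state
         * and keep the indirect tail call
         */
        return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";

Whether the direct jump is actually emitted still depends on prog->jit_requested and constant blinding being disabled, as checked in fixup_bpf_calls() below.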
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net
---
 kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 111 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fc85714428c7..a0482e1c4a77 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
 #define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
 #define BPF_COMPLEXITY_LIMIT_STATES	64
 
+#define BPF_MAP_KEY_POISON	(1ULL << 63)
+#define BPF_MAP_KEY_SEEN	(1ULL << 62)
+
 #define BPF_MAP_PTR_UNPRIV	1UL
 #define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
 					  POISON_POINTER_DELTA))
@@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem {
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
-	return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
 }
 
 static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
 {
-	return aux->map_state & BPF_MAP_PTR_UNPRIV;
+	return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
 }
 
 static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
@@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
 {
 	BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
 	unpriv |= bpf_map_ptr_unpriv(aux);
-	aux->map_state = (unsigned long)map |
-			 (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+	aux->map_ptr_state = (unsigned long)map |
+			     (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
+
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_key_state & BPF_MAP_KEY_POISON;
+}
+
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
+{
+	return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
+}
+
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
+}
+
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
+{
+	bool poisoned = bpf_map_key_poisoned(aux);
+
+	aux->map_key_state = state | BPF_MAP_KEY_SEEN |
+			     (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
+}
 
 struct bpf_call_arg_meta {
@@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 		return -EACCES;
 	}
 
-	if (!BPF_MAP_PTR(aux->map_state))
+	if (!BPF_MAP_PTR(aux->map_ptr_state))
 		bpf_map_ptr_store(aux, meta->map_ptr,
 				  meta->map_ptr->unpriv_array);
-	else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+	else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
 		bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
 				  meta->map_ptr->unpriv_array);
 	return 0;
 }
 
+static int
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+		int func_id, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+	struct bpf_reg_state *regs = cur_regs(env), *reg;
+	struct bpf_map *map = meta->map_ptr;
+	struct tnum range;
+	u64 val;
+
+	if (func_id != BPF_FUNC_tail_call)
+		return 0;
+	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
+		verbose(env, "kernel subsystem misconfigured verifier\n");
+		return -EINVAL;
+	}
+
+	range = tnum_range(0, map->max_entries - 1);
+	reg = &regs[BPF_REG_3];
+
+	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+		return 0;
+	}
+
+	val = reg->var_off.value;
+	if (bpf_map_key_unseen(aux))
+		bpf_map_key_store(aux, val);
+	else if (!bpf_map_key_poisoned(aux) &&
+		 bpf_map_key_immediate(aux) != val)
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+	return 0;
+}
+
 static int check_reference_leak(struct bpf_verifier_env *env)
 {
 	struct bpf_func_state *state = cur_func(env);
@@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	err = record_func_key(env, &meta, func_id, insn_idx);
+	if (err)
+		return err;
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
@@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 static int fixup_bpf_calls(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
+	bool expect_blinding = bpf_jit_blinding_enabled(prog);
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
@@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
-	int i, cnt, delta = 0;
+	int i, ret, cnt, delta = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
 			aux = &env->insn_aux_data[i + delta];
+			if (prog->jit_requested && !expect_blinding &&
+			    !bpf_map_key_poisoned(aux) &&
+			    !bpf_map_ptr_poisoned(aux) &&
+			    !bpf_map_ptr_unpriv(aux)) {
+				struct bpf_jit_poke_descriptor desc = {
+					.reason = BPF_POKE_REASON_TAIL_CALL,
+					.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+					.tail_call.key = bpf_map_key_immediate(aux),
+				};
+
+				ret = bpf_jit_add_poke_descriptor(prog, &desc);
+				if (ret < 0) {
+					verbose(env, "adding tail call poke descriptor failed\n");
+					return ret;
+				}
+
+				insn->imm = ret + 1;
+				continue;
+			}
+
 			if (!bpf_map_ptr_unpriv(aux))
 				continue;
 
@@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 
-			map_ptr = BPF_MAP_PTR(aux->map_state);
+			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
 						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		if (bpf_map_ptr_poisoned(aux))
 			goto patch_call_imm;
 
-		map_ptr = BPF_MAP_PTR(aux->map_state);
+		map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
 		ops = map_ptr->ops;
 		if (insn->imm == BPF_FUNC_map_lookup_elem &&
 		    ops->map_gen_lookup) {
@@ -9345,6 +9430,23 @@ patch_call_imm:
 		insn->imm = fn->func - __bpf_call_base;
 	}
 
+	/* Since poke tab is now finalized, publish aux to tracker. */
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		map_ptr = prog->aux->poke_tab[i].tail_call.map;
+		if (!map_ptr->ops->map_poke_track ||
+		    !map_ptr->ops->map_poke_untrack ||
+		    !map_ptr->ops->map_poke_run) {
+			verbose(env, "bpf verifier is misconfigured\n");
+			return -EINVAL;
+		}
+
+		ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+		if (ret < 0) {
+			verbose(env, "tracking tail call prog failed\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
-- cgit v1.2.3


From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 24 Nov 2019 01:39:42 +0100
Subject: bpf: Simplify __bpf_arch_text_poke poke type handling

Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP
and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in
old_addr as well as new_addr, it's a bit redundant and unnecessarily
complicates __bpf_arch_text_poke() itself since we can derive the same
from the *_addr that were passed in. Hence simplify and use
BPF_MOD_{CALL,JUMP} as types, which also allows us to clean up the
call-sites.

In addition to that, __bpf_arch_text_poke() currently verifies that the
text matches the expected old_insn before we invoke text_poke_bp(). Also
add a check on new_insn and skip the rewrite if it already matches.
The reason why this is rather useful is that it avoids any special casing
in prog_array_map_poke_run() when old and new prog are NULL, and it has
the benefit that also for this case we perform a check on the text to see
whether it really matches our expectations.

Suggested-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net
---
 kernel/bpf/arraymap.c   | 12 +-----------
 kernel/bpf/trampoline.c |  8 ++++----
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 58bdf5fd24cc..f0d19bbb9211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -746,19 +746,9 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 				    struct bpf_prog *old,
 				    struct bpf_prog *new)
 {
-	enum bpf_text_poke_type type;
 	struct prog_poke_elem *elem;
 	struct bpf_array_aux *aux;
 
-	if (!old && new)
-		type = BPF_MOD_NOP_TO_JUMP;
-	else if (old && !new)
-		type = BPF_MOD_JUMP_TO_NOP;
-	else if (old && new)
-		type = BPF_MOD_JUMP_TO_JUMP;
-	else
-		return;
-
 	aux = container_of(map, struct bpf_array, map)->aux;
 	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
@@ -806,7 +796,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			    poke->tail_call.key != key)
 				continue;
 
-			ret = bpf_arch_text_poke(poke->ip, type,
+			ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
 						 old ? (u8 *)old->bpf_func +
 						       poke->adj_off : NULL,
 						 new ? (u8 *)new->bpf_func +
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 10ae59d65f13..7e89f1f49d77 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -77,7 +77,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	int err;
 
 	if (fentry_cnt + fexit_cnt == 0) {
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
 					 old_image, NULL);
 		tr->selector = 0;
 		goto out;
@@ -105,12 +105,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 
 	if (tr->selector)
 		/* progs already running at this address */
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
 					 old_image, new_image);
 	else
 		/* first time registering */
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
-					 NULL, new_image);
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL,
+					 new_image);
 	if (err)
 		goto out;
 	tr->selector++;
-- cgit v1.2.3
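
For illustration, the decision logic the commit message above describes
(old/new state derived from whether old_addr and new_addr are NULL, a check
that the existing text matches the expected old bytes, and an early return
when it already matches the new bytes) can be sketched outside the kernel as
follows. Everything here, including emit_insn(), text_poke_sketch() and the
fixed 5-byte insn size, is an assumed stand-in for illustration, not the
kernel's actual __bpf_arch_text_poke() implementation.

#include <stdint.h>
#include <string.h>

#define POKE_INSN_SIZE 5	/* assumed fixed size of a call/jmp or nop */

enum poke_type { MOD_CALL, MOD_JUMP };

/* Encode either a nop (addr == NULL) or a toy call/jump to addr. */
static void emit_insn(unsigned char *buf, enum poke_type t,
		      void *ip, void *addr)
{
	int32_t rel;

	if (!addr) {
		memset(buf, 0x90, POKE_INSN_SIZE);	/* nop padding */
		return;
	}
	buf[0] = (t == MOD_CALL) ? 0xE8 : 0xE9;		/* toy opcodes */
	rel = (int32_t)((uintptr_t)addr - ((uintptr_t)ip + POKE_INSN_SIZE));
	memcpy(&buf[1], &rel, sizeof(rel));
}

/* Old/new state is derived from the NULL-ness of old_addr/new_addr, so a
 * single MOD_CALL/MOD_JUMP type per call site is enough.
 */
static int text_poke_sketch(void *ip, enum poke_type t,
			    void *old_addr, void *new_addr)
{
	unsigned char old_insn[POKE_INSN_SIZE], new_insn[POKE_INSN_SIZE];

	emit_insn(old_insn, t, ip, old_addr);	/* NULL -> expect a nop */
	emit_insn(new_insn, t, ip, new_addr);	/* NULL -> write a nop */

	if (memcmp(ip, old_insn, POKE_INSN_SIZE) != 0)
		return -1;		/* text does not match expectations */
	if (memcmp(ip, new_insn, POKE_INSN_SIZE) == 0)
		return 0;		/* already in the desired state */

	memcpy(ip, new_insn, POKE_INSN_SIZE);	/* real code: text_poke_bp() */
	return 0;
}

int main(void)
{
	static unsigned char text[POKE_INSN_SIZE] = {
		0x90, 0x90, 0x90, 0x90, 0x90 };		/* starts as a nop */
	static int target;

	/* nop -> jump, then jump -> nop, exercising both directions */
	if (text_poke_sketch(text, MOD_JUMP, NULL, &target))
		return 1;
	return text_poke_sketch(text, MOD_JUMP, &target, NULL);
}

With this shape a call site only ever passes BPF_MOD_JUMP or BPF_MOD_CALL
plus the old and new addresses, which is how prog_array_map_poke_run() and
bpf_trampoline_update() read after the diff above.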