From 914cc63eea6fbe11ed46dba5e4438d81b0cd42d2 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:31 -0700 Subject: kunit: test: add KUnit test runner core Add core facilities for defining unit tests; this provides a common way to define test cases, functions that execute code which is under test and determine whether the code under test behaves as expected; this also provides a way to group together related test cases in test suites (here we call them test_modules). Just define test cases and how to execute them for now; setting expectations on code will be defined later. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Luis Chamberlain Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/test.h | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 include/kunit/test.h (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h new file mode 100644 index 000000000000..e30d1bf2fb68 --- /dev/null +++ b/include/kunit/test.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Base unit test (KUnit) API. + * + * Copyright (C) 2019, Google LLC. + * Author: Brendan Higgins + */ + +#ifndef _KUNIT_TEST_H +#define _KUNIT_TEST_H + +#include + +struct kunit; + +/** + * struct kunit_case - represents an individual test case. + * + * @run_case: the function representing the actual test case. + * @name: the name of the test case. + * + * A test case is a function with the signature, + * ``void (*)(struct kunit *)`` that makes expectations (see + * KUNIT_EXPECT_TRUE()) about code under test. Each test case is associated + * with a &struct kunit_suite and will be run after the suite's init + * function and followed by the suite's exit function. + * + * A test case should be static and should only be created with the + * KUNIT_CASE() macro; additionally, every array of test cases should be + * terminated with an empty test case. + * + * Example: + * + * .. code-block:: c + * + * void add_test_basic(struct kunit *test) + * { + * KUNIT_EXPECT_EQ(test, 1, add(1, 0)); + * KUNIT_EXPECT_EQ(test, 2, add(1, 1)); + * KUNIT_EXPECT_EQ(test, 0, add(-1, 1)); + * KUNIT_EXPECT_EQ(test, INT_MAX, add(0, INT_MAX)); + * KUNIT_EXPECT_EQ(test, -1, add(INT_MAX, INT_MIN)); + * } + * + * static struct kunit_case example_test_cases[] = { + * KUNIT_CASE(add_test_basic), + * {} + * }; + * + */ +struct kunit_case { + void (*run_case)(struct kunit *test); + const char *name; + + /* private: internal use only. */ + bool success; +}; + +/** + * KUNIT_CASE - A helper for creating a &struct kunit_case + * + * @test_name: a reference to a test case function. + * + * Takes a symbol for a function representing a test case and creates a + * &struct kunit_case object from it. See the documentation for + * &struct kunit_case for an example on how to use it. + */ +#define KUNIT_CASE(test_name) { .run_case = test_name, .name = #test_name } + +/** + * struct kunit_suite - describes a related collection of &struct kunit_case + * + * @name: the name of the test. Purely informational. + * @init: called before every test case. + * @exit: called after every test case. + * @test_cases: a null terminated array of test cases. + * + * A kunit_suite is a collection of related &struct kunit_case s, such that + * @init is called before every test case and @exit is called after every + * test case, similar to the notion of a *test fixture* or a *test class* + * in other unit testing frameworks like JUnit or Googletest. + * + * Every &struct kunit_case must be associated with a kunit_suite for KUnit + * to run it. + */ +struct kunit_suite { + const char name[256]; + int (*init)(struct kunit *test); + void (*exit)(struct kunit *test); + struct kunit_case *test_cases; +}; + +/** + * struct kunit - represents a running instance of a test. + * + * @priv: for user to store arbitrary data. Commonly used to pass data + * created in the init function (see &struct kunit_suite). + * + * Used to store information about the current context under which the test + * is running. Most of this data is private and should only be accessed + * indirectly via public functions; the one exception is @priv which can be + * used by the test writer to store arbitrary data. + */ +struct kunit { + void *priv; + + /* private: internal use only. */ + const char *name; /* Read only after initialization! */ + /* + * success starts as true, and may only be set to false during a + * test case; thus, it is safe to update this across multiple + * threads using WRITE_ONCE; however, as a consequence, it may only + * be read after the test case finishes once all threads associated + * with the test case have terminated. + */ + bool success; /* Read only after test_case finishes! */ +}; + +void kunit_init_test(struct kunit *test, const char *name); + +int kunit_run_tests(struct kunit_suite *suite); + +/** + * kunit_test_suite() - used to register a &struct kunit_suite with KUnit. + * + * @suite: a statically allocated &struct kunit_suite. + * + * Registers @suite with the test framework. See &struct kunit_suite for + * more information. + * + * NOTE: Currently KUnit tests are all run as late_initcalls; this means + * that they cannot test anything where tests must run at a different init + * phase. One significant restriction resulting from this is that KUnit + * cannot reliably test anything that is initialize in the late_init phase; + * another is that KUnit is useless to test things that need to be run in + * an earlier init phase. + * + * TODO(brendanhiggins@google.com): Don't run all KUnit tests as + * late_initcalls. I have some future work planned to dispatch all KUnit + * tests from the same place, and at the very least to do so after + * everything else is definitely initialized. + */ +#define kunit_test_suite(suite) \ + static int kunit_suite_init##suite(void) \ + { \ + return kunit_run_tests(&suite); \ + } \ + late_initcall(kunit_suite_init##suite) + +void __printf(3, 4) kunit_printk(const char *level, + const struct kunit *test, + const char *fmt, ...); + +/** + * kunit_info() - Prints an INFO level message associated with @test. + * + * @test: The test context object. + * @fmt: A printk() style format string. + * + * Prints an info level message associated with the test suite being run. + * Takes a variable number of format parameters just like printk(). + */ +#define kunit_info(test, fmt, ...) \ + kunit_printk(KERN_INFO, test, fmt, ##__VA_ARGS__) + +/** + * kunit_warn() - Prints a WARN level message associated with @test. + * + * @test: The test context object. + * @fmt: A printk() style format string. + * + * Prints a warning level message. + */ +#define kunit_warn(test, fmt, ...) \ + kunit_printk(KERN_WARNING, test, fmt, ##__VA_ARGS__) + +/** + * kunit_err() - Prints an ERROR level message associated with @test. + * + * @test: The test context object. + * @fmt: A printk() style format string. + * + * Prints an error level message. + */ +#define kunit_err(test, fmt, ...) \ + kunit_printk(KERN_ERR, test, fmt, ##__VA_ARGS__) + +#endif /* _KUNIT_TEST_H */ -- cgit v1.2.3 From 0a756853586ce185ee1bb9ccbe5ec03e103e594f Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:32 -0700 Subject: kunit: test: add test resource management API Create a common API for test managed resources like memory and test objects. A lot of times a test will want to set up infrastructure to be used in test cases; this could be anything from just wanting to allocate some memory to setting up a driver stack; this defines facilities for creating "test resources" which are managed by the test infrastructure and are automatically cleaned up at the conclusion of the test. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/test.h | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index e30d1bf2fb68..6781c756f11b 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -9,8 +9,72 @@ #ifndef _KUNIT_TEST_H #define _KUNIT_TEST_H +#include #include +struct kunit_resource; + +typedef int (*kunit_resource_init_t)(struct kunit_resource *, void *); +typedef void (*kunit_resource_free_t)(struct kunit_resource *); + +/** + * struct kunit_resource - represents a *test managed resource* + * @allocation: for the user to store arbitrary data. + * @free: a user supplied function to free the resource. Populated by + * kunit_alloc_resource(). + * + * Represents a *test managed resource*, a resource which will automatically be + * cleaned up at the end of a test case. + * + * Example: + * + * .. code-block:: c + * + * struct kunit_kmalloc_params { + * size_t size; + * gfp_t gfp; + * }; + * + * static int kunit_kmalloc_init(struct kunit_resource *res, void *context) + * { + * struct kunit_kmalloc_params *params = context; + * res->allocation = kmalloc(params->size, params->gfp); + * + * if (!res->allocation) + * return -ENOMEM; + * + * return 0; + * } + * + * static void kunit_kmalloc_free(struct kunit_resource *res) + * { + * kfree(res->allocation); + * } + * + * void *kunit_kmalloc(struct kunit *test, size_t size, gfp_t gfp) + * { + * struct kunit_kmalloc_params params; + * struct kunit_resource *res; + * + * params.size = size; + * params.gfp = gfp; + * + * res = kunit_alloc_resource(test, kunit_kmalloc_init, + * kunit_kmalloc_free, ¶ms); + * if (res) + * return res->allocation; + * + * return NULL; + * } + */ +struct kunit_resource { + void *allocation; + kunit_resource_free_t free; + + /* private: internal use only. */ + struct list_head node; +}; + struct kunit; /** @@ -114,6 +178,13 @@ struct kunit { * with the test case have terminated. */ bool success; /* Read only after test_case finishes! */ + spinlock_t lock; /* Guards all mutable test state. */ + /* + * Because resources is a list that may be updated multiple times (with + * new resources) from any thread associated with a test case, we must + * protect it with some type of lock. + */ + struct list_head resources; /* Protected by lock. */ }; void kunit_init_test(struct kunit *test, const char *name); @@ -147,6 +218,122 @@ int kunit_run_tests(struct kunit_suite *suite); } \ late_initcall(kunit_suite_init##suite) +/* + * Like kunit_alloc_resource() below, but returns the struct kunit_resource + * object that contains the allocation. This is mostly for testing purposes. + */ +struct kunit_resource *kunit_alloc_and_get_resource(struct kunit *test, + kunit_resource_init_t init, + kunit_resource_free_t free, + gfp_t internal_gfp, + void *context); + +/** + * kunit_alloc_resource() - Allocates a *test managed resource*. + * @test: The test context object. + * @init: a user supplied function to initialize the resource. + * @free: a user supplied function to free the resource. + * @internal_gfp: gfp to use for internal allocations, if unsure, use GFP_KERNEL + * @context: for the user to pass in arbitrary data to the init function. + * + * Allocates a *test managed resource*, a resource which will automatically be + * cleaned up at the end of a test case. See &struct kunit_resource for an + * example. + * + * NOTE: KUnit needs to allocate memory for each kunit_resource object. You must + * specify an @internal_gfp that is compatible with the use context of your + * resource. + */ +static inline void *kunit_alloc_resource(struct kunit *test, + kunit_resource_init_t init, + kunit_resource_free_t free, + gfp_t internal_gfp, + void *context) +{ + struct kunit_resource *res; + + res = kunit_alloc_and_get_resource(test, init, free, internal_gfp, + context); + + if (res) + return res->allocation; + + return NULL; +} + +typedef bool (*kunit_resource_match_t)(struct kunit *test, + const void *res, + void *match_data); + +/** + * kunit_resource_instance_match() - Match a resource with the same instance. + * @test: Test case to which the resource belongs. + * @res: The data stored in kunit_resource->allocation. + * @match_data: The resource pointer to match against. + * + * An instance of kunit_resource_match_t that matches a resource whose + * allocation matches @match_data. + */ +static inline bool kunit_resource_instance_match(struct kunit *test, + const void *res, + void *match_data) +{ + return res == match_data; +} + +/** + * kunit_resource_destroy() - Find a kunit_resource and destroy it. + * @test: Test case to which the resource belongs. + * @match: Match function. Returns whether a given resource matches @match_data. + * @free: Must match free on the kunit_resource to free. + * @match_data: Data passed into @match. + * + * Free the latest kunit_resource of @test for which @free matches the + * kunit_resource_free_t associated with the resource and for which @match + * returns true. + * + * RETURNS: + * 0 if kunit_resource is found and freed, -ENOENT if not found. + */ +int kunit_resource_destroy(struct kunit *test, + kunit_resource_match_t match, + kunit_resource_free_t free, + void *match_data); + +/** + * kunit_kmalloc() - Like kmalloc() except the allocation is *test managed*. + * @test: The test context object. + * @size: The size in bytes of the desired memory. + * @gfp: flags passed to underlying kmalloc(). + * + * Just like `kmalloc(...)`, except the allocation is managed by the test case + * and is automatically cleaned up after the test case concludes. See &struct + * kunit_resource for more information. + */ +void *kunit_kmalloc(struct kunit *test, size_t size, gfp_t gfp); + +/** + * kunit_kfree() - Like kfree except for allocations managed by KUnit. + * @test: The test case to which the resource belongs. + * @ptr: The memory allocation to free. + */ +void kunit_kfree(struct kunit *test, const void *ptr); + +/** + * kunit_kzalloc() - Just like kunit_kmalloc(), but zeroes the allocation. + * @test: The test context object. + * @size: The size in bytes of the desired memory. + * @gfp: flags passed to underlying kmalloc(). + * + * See kzalloc() and kunit_kmalloc() for more information. + */ +static inline void *kunit_kzalloc(struct kunit *test, size_t size, gfp_t gfp) +{ + return kunit_kmalloc(test, size, gfp | __GFP_ZERO); +} + +void kunit_cleanup(struct kunit *test); + void __printf(3, 4) kunit_printk(const char *level, const struct kunit *test, const char *fmt, ...); -- cgit v1.2.3 From d1fadef194008b22d7d9608426cad29d4bb69ab9 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:33 -0700 Subject: kunit: test: add string_stream a std::stream like string builder A number of test features need to do pretty complicated string printing where it may not be possible to rely on a single preallocated string with parameters. So provide a library for constructing the string as you go similar to C++'s std::string. string_stream is really just a string builder, nothing more. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/string-stream.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 include/kunit/string-stream.h (limited to 'include') diff --git a/include/kunit/string-stream.h b/include/kunit/string-stream.h new file mode 100644 index 000000000000..fe98a00b75a9 --- /dev/null +++ b/include/kunit/string-stream.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * C++ stream style string builder used in KUnit for building messages. + * + * Copyright (C) 2019, Google LLC. + * Author: Brendan Higgins + */ + +#ifndef _KUNIT_STRING_STREAM_H +#define _KUNIT_STRING_STREAM_H + +#include +#include +#include + +struct string_stream_fragment { + struct kunit *test; + struct list_head node; + char *fragment; +}; + +struct string_stream { + size_t length; + struct list_head fragments; + /* length and fragments are protected by this lock */ + spinlock_t lock; + struct kunit *test; + gfp_t gfp; +}; + +struct kunit; + +struct string_stream *alloc_string_stream(struct kunit *test, gfp_t gfp); + +int __printf(2, 3) string_stream_add(struct string_stream *stream, + const char *fmt, ...); + +int string_stream_vadd(struct string_stream *stream, + const char *fmt, + va_list args); + +char *string_stream_get_string(struct string_stream *stream); + +int string_stream_append(struct string_stream *stream, + struct string_stream *other); + +bool string_stream_is_empty(struct string_stream *stream); + +int string_stream_destroy(struct string_stream *stream); + +#endif /* _KUNIT_STRING_STREAM_H */ -- cgit v1.2.3 From 6b229e593ff9ac2fd8d83c3654e0f39fa1d49a67 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:34 -0700 Subject: kunit: test: add assertion printing library Add `struct kunit_assert` and friends which provide a structured way to capture data from an expectation or an assertion (introduced later in the series) so that it may be printed out in the event of a failure. Signed-off-by: Brendan Higgins Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/assert.h | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 include/kunit/assert.h (limited to 'include') diff --git a/include/kunit/assert.h b/include/kunit/assert.h new file mode 100644 index 000000000000..db6a0fca09b4 --- /dev/null +++ b/include/kunit/assert.h @@ -0,0 +1,356 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Assertion and expectation serialization API. + * + * Copyright (C) 2019, Google LLC. + * Author: Brendan Higgins + */ + +#ifndef _KUNIT_ASSERT_H +#define _KUNIT_ASSERT_H + +#include +#include + +struct kunit; + +/** + * enum kunit_assert_type - Type of expectation/assertion. + * @KUNIT_ASSERTION: Used to denote that a kunit_assert represents an assertion. + * @KUNIT_EXPECTATION: Denotes that a kunit_assert represents an expectation. + * + * Used in conjunction with a &struct kunit_assert to denote whether it + * represents an expectation or an assertion. + */ +enum kunit_assert_type { + KUNIT_ASSERTION, + KUNIT_EXPECTATION, +}; + +/** + * struct kunit_assert - Data for printing a failed assertion or expectation. + * @test: the test case this expectation/assertion is associated with. + * @type: the type (either an expectation or an assertion) of this kunit_assert. + * @line: the source code line number that the expectation/assertion is at. + * @file: the file path of the source file that the expectation/assertion is in. + * @message: an optional message to provide additional context. + * @format: a function which formats the data in this kunit_assert to a string. + * + * Represents a failed expectation/assertion. Contains all the data necessary to + * format a string to a user reporting the failure. + */ +struct kunit_assert { + struct kunit *test; + enum kunit_assert_type type; + int line; + const char *file; + struct va_format message; + void (*format)(const struct kunit_assert *assert, + struct string_stream *stream); +}; + +/** + * KUNIT_INIT_VA_FMT_NULL - Default initializer for struct va_format. + * + * Used inside a struct initialization block to initialize struct va_format to + * default values where fmt and va are null. + */ +#define KUNIT_INIT_VA_FMT_NULL { .fmt = NULL, .va = NULL } + +/** + * KUNIT_INIT_ASSERT_STRUCT() - Initializer for a &struct kunit_assert. + * @kunit: The test case that this expectation/assertion is associated with. + * @assert_type: The type (assertion or expectation) of this kunit_assert. + * @fmt: The formatting function which builds a string out of this kunit_assert. + * + * The base initializer for a &struct kunit_assert. + */ +#define KUNIT_INIT_ASSERT_STRUCT(kunit, assert_type, fmt) { \ + .test = kunit, \ + .type = assert_type, \ + .file = __FILE__, \ + .line = __LINE__, \ + .message = KUNIT_INIT_VA_FMT_NULL, \ + .format = fmt \ +} + +void kunit_base_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +void kunit_assert_print_msg(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * struct kunit_fail_assert - Represents a plain fail expectation/assertion. + * @assert: The parent of this type. + * + * Represents a simple KUNIT_FAIL/KUNIT_ASSERT_FAILURE that always fails. + */ +struct kunit_fail_assert { + struct kunit_assert assert; +}; + +void kunit_fail_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_FAIL_ASSERT_STRUCT() - Initializer for &struct kunit_fail_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * + * Initializes a &struct kunit_fail_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_FAIL_ASSERT_STRUCT(test, type) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_fail_assert_format) \ +} + +/** + * struct kunit_unary_assert - Represents a KUNIT_{EXPECT|ASSERT}_{TRUE|FALSE} + * @assert: The parent of this type. + * @condition: A string representation of a conditional expression. + * @expected_true: True if of type KUNIT_{EXPECT|ASSERT}_TRUE, false otherwise. + * + * Represents a simple expectation or assertion that simply asserts something is + * true or false. In other words, represents the expectations: + * KUNIT_{EXPECT|ASSERT}_{TRUE|FALSE} + */ +struct kunit_unary_assert { + struct kunit_assert assert; + const char *condition; + bool expected_true; +}; + +void kunit_unary_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_UNARY_ASSERT_STRUCT() - Initializes &struct kunit_unary_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * @cond: A string representation of the expression asserted true or false. + * @expect_true: True if of type KUNIT_{EXPECT|ASSERT}_TRUE, false otherwise. + * + * Initializes a &struct kunit_unary_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_UNARY_ASSERT_STRUCT(test, type, cond, expect_true) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_unary_assert_format), \ + .condition = cond, \ + .expected_true = expect_true \ +} + +/** + * struct kunit_ptr_not_err_assert - An expectation/assertion that a pointer is + * not NULL and not a -errno. + * @assert: The parent of this type. + * @text: A string representation of the expression passed to the expectation. + * @value: The actual evaluated pointer value of the expression. + * + * Represents an expectation/assertion that a pointer is not null and is does + * not contain a -errno. (See IS_ERR_OR_NULL().) + */ +struct kunit_ptr_not_err_assert { + struct kunit_assert assert; + const char *text; + const void *value; +}; + +void kunit_ptr_not_err_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_PTR_NOT_ERR_ASSERT_STRUCT() - Initializes a + * &struct kunit_ptr_not_err_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * @txt: A string representation of the expression passed to the expectation. + * @val: The actual evaluated pointer value of the expression. + * + * Initializes a &struct kunit_ptr_not_err_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_PTR_NOT_ERR_STRUCT(test, type, txt, val) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_ptr_not_err_assert_format), \ + .text = txt, \ + .value = val \ +} + +/** + * struct kunit_binary_assert - An expectation/assertion that compares two + * non-pointer values (for example, KUNIT_EXPECT_EQ(test, 1 + 1, 2)). + * @assert: The parent of this type. + * @operation: A string representation of the comparison operator (e.g. "=="). + * @left_text: A string representation of the expression in the left slot. + * @left_value: The actual evaluated value of the expression in the left slot. + * @right_text: A string representation of the expression in the right slot. + * @right_value: The actual evaluated value of the expression in the right slot. + * + * Represents an expectation/assertion that compares two non-pointer values. For + * example, to expect that 1 + 1 == 2, you can use the expectation + * KUNIT_EXPECT_EQ(test, 1 + 1, 2); + */ +struct kunit_binary_assert { + struct kunit_assert assert; + const char *operation; + const char *left_text; + long long left_value; + const char *right_text; + long long right_value; +}; + +void kunit_binary_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_BINARY_ASSERT_STRUCT() - Initializes a + * &struct kunit_binary_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * @op_str: A string representation of the comparison operator (e.g. "=="). + * @left_str: A string representation of the expression in the left slot. + * @left_val: The actual evaluated value of the expression in the left slot. + * @right_str: A string representation of the expression in the right slot. + * @right_val: The actual evaluated value of the expression in the right slot. + * + * Initializes a &struct kunit_binary_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_BINARY_ASSERT_STRUCT(test, \ + type, \ + op_str, \ + left_str, \ + left_val, \ + right_str, \ + right_val) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_binary_assert_format), \ + .operation = op_str, \ + .left_text = left_str, \ + .left_value = left_val, \ + .right_text = right_str, \ + .right_value = right_val \ +} + +/** + * struct kunit_binary_ptr_assert - An expectation/assertion that compares two + * pointer values (for example, KUNIT_EXPECT_PTR_EQ(test, foo, bar)). + * @assert: The parent of this type. + * @operation: A string representation of the comparison operator (e.g. "=="). + * @left_text: A string representation of the expression in the left slot. + * @left_value: The actual evaluated value of the expression in the left slot. + * @right_text: A string representation of the expression in the right slot. + * @right_value: The actual evaluated value of the expression in the right slot. + * + * Represents an expectation/assertion that compares two pointer values. For + * example, to expect that foo and bar point to the same thing, you can use the + * expectation KUNIT_EXPECT_PTR_EQ(test, foo, bar); + */ +struct kunit_binary_ptr_assert { + struct kunit_assert assert; + const char *operation; + const char *left_text; + const void *left_value; + const char *right_text; + const void *right_value; +}; + +void kunit_binary_ptr_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT() - Initializes a + * &struct kunit_binary_ptr_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * @op_str: A string representation of the comparison operator (e.g. "=="). + * @left_str: A string representation of the expression in the left slot. + * @left_val: The actual evaluated value of the expression in the left slot. + * @right_str: A string representation of the expression in the right slot. + * @right_val: The actual evaluated value of the expression in the right slot. + * + * Initializes a &struct kunit_binary_ptr_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT(test, \ + type, \ + op_str, \ + left_str, \ + left_val, \ + right_str, \ + right_val) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_binary_ptr_assert_format), \ + .operation = op_str, \ + .left_text = left_str, \ + .left_value = left_val, \ + .right_text = right_str, \ + .right_value = right_val \ +} + +/** + * struct kunit_binary_str_assert - An expectation/assertion that compares two + * string values (for example, KUNIT_EXPECT_STREQ(test, foo, "bar")). + * @assert: The parent of this type. + * @operation: A string representation of the comparison operator (e.g. "=="). + * @left_text: A string representation of the expression in the left slot. + * @left_value: The actual evaluated value of the expression in the left slot. + * @right_text: A string representation of the expression in the right slot. + * @right_value: The actual evaluated value of the expression in the right slot. + * + * Represents an expectation/assertion that compares two string values. For + * example, to expect that the string in foo is equal to "bar", you can use the + * expectation KUNIT_EXPECT_STREQ(test, foo, "bar"); + */ +struct kunit_binary_str_assert { + struct kunit_assert assert; + const char *operation; + const char *left_text; + const char *left_value; + const char *right_text; + const char *right_value; +}; + +void kunit_binary_str_assert_format(const struct kunit_assert *assert, + struct string_stream *stream); + +/** + * KUNIT_INIT_BINARY_STR_ASSERT_STRUCT() - Initializes a + * &struct kunit_binary_str_assert. + * @test: The test case that this expectation/assertion is associated with. + * @type: The type (assertion or expectation) of this kunit_assert. + * @op_str: A string representation of the comparison operator (e.g. "=="). + * @left_str: A string representation of the expression in the left slot. + * @left_val: The actual evaluated value of the expression in the left slot. + * @right_str: A string representation of the expression in the right slot. + * @right_val: The actual evaluated value of the expression in the right slot. + * + * Initializes a &struct kunit_binary_str_assert. Intended to be used in + * KUNIT_EXPECT_* and KUNIT_ASSERT_* macros. + */ +#define KUNIT_INIT_BINARY_STR_ASSERT_STRUCT(test, \ + type, \ + op_str, \ + left_str, \ + left_val, \ + right_str, \ + right_val) { \ + .assert = KUNIT_INIT_ASSERT_STRUCT(test, \ + type, \ + kunit_binary_str_assert_format), \ + .operation = op_str, \ + .left_text = left_str, \ + .left_value = left_val, \ + .right_text = right_str, \ + .right_value = right_val \ +} + +#endif /* _KUNIT_ASSERT_H */ -- cgit v1.2.3 From 73cda7bb8bfb1d4be0325d76172950ede1a65fd0 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:35 -0700 Subject: kunit: test: add the concept of expectations Add support for expectations, which allow properties to be specified and then verified in tests. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/test.h | 836 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 836 insertions(+) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 6781c756f11b..30a62de16bc9 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -9,6 +9,8 @@ #ifndef _KUNIT_TEST_H #define _KUNIT_TEST_H +#include +#include #include #include @@ -372,4 +374,838 @@ void __printf(3, 4) kunit_printk(const char *level, #define kunit_err(test, fmt, ...) \ kunit_printk(KERN_ERR, test, fmt, ##__VA_ARGS__) +/** + * KUNIT_SUCCEED() - A no-op expectation. Only exists for code clarity. + * @test: The test context object. + * + * The opposite of KUNIT_FAIL(), it is an expectation that cannot fail. In other + * words, it does nothing and only exists for code clarity. See + * KUNIT_EXPECT_TRUE() for more information. + */ +#define KUNIT_SUCCEED(test) do {} while (0) + +void kunit_do_assertion(struct kunit *test, + struct kunit_assert *assert, + bool pass, + const char *fmt, ...); + +#define KUNIT_ASSERTION(test, pass, assert_class, INITIALIZER, fmt, ...) do { \ + struct assert_class __assertion = INITIALIZER; \ + kunit_do_assertion(test, \ + &__assertion.assert, \ + pass, \ + fmt, \ + ##__VA_ARGS__); \ +} while (0) + + +#define KUNIT_FAIL_ASSERTION(test, assert_type, fmt, ...) \ + KUNIT_ASSERTION(test, \ + false, \ + kunit_fail_assert, \ + KUNIT_INIT_FAIL_ASSERT_STRUCT(test, assert_type), \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_FAIL() - Always causes a test to fail when evaluated. + * @test: The test context object. + * @fmt: an informational message to be printed when the assertion is made. + * @...: string format arguments. + * + * The opposite of KUNIT_SUCCEED(), it is an expectation that always fails. In + * other words, it always results in a failed expectation, and consequently + * always causes the test case to fail when evaluated. See KUNIT_EXPECT_TRUE() + * for more information. + */ +#define KUNIT_FAIL(test, fmt, ...) \ + KUNIT_FAIL_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_UNARY_ASSERTION(test, \ + assert_type, \ + condition, \ + expected_true, \ + fmt, \ + ...) \ + KUNIT_ASSERTION(test, \ + !!(condition) == !!expected_true, \ + kunit_unary_assert, \ + KUNIT_INIT_UNARY_ASSERT_STRUCT(test, \ + assert_type, \ + #condition, \ + expected_true), \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_TRUE_MSG_ASSERTION(test, assert_type, condition, fmt, ...) \ + KUNIT_UNARY_ASSERTION(test, \ + assert_type, \ + condition, \ + true, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_TRUE_ASSERTION(test, assert_type, condition) \ + KUNIT_TRUE_MSG_ASSERTION(test, assert_type, condition, NULL) + +#define KUNIT_FALSE_MSG_ASSERTION(test, assert_type, condition, fmt, ...) \ + KUNIT_UNARY_ASSERTION(test, \ + assert_type, \ + condition, \ + false, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_FALSE_ASSERTION(test, assert_type, condition) \ + KUNIT_FALSE_MSG_ASSERTION(test, assert_type, condition, NULL) + +/* + * A factory macro for defining the assertions and expectations for the basic + * comparisons defined for the built in types. + * + * Unfortunately, there is no common type that all types can be promoted to for + * which all the binary operators behave the same way as for the actual types + * (for example, there is no type that long long and unsigned long long can + * both be cast to where the comparison result is preserved for all values). So + * the best we can do is do the comparison in the original types and then coerce + * everything to long long for printing; this way, the comparison behaves + * correctly and the printed out value usually makes sense without + * interpretation, but can always be interpreted to figure out the actual + * value. + */ +#define KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + op, \ + right, \ + fmt, \ + ...) \ +do { \ + typeof(left) __left = (left); \ + typeof(right) __right = (right); \ + ((void)__typecheck(__left, __right)); \ + \ + KUNIT_ASSERTION(test, \ + __left op __right, \ + assert_class, \ + ASSERT_CLASS_INIT(test, \ + assert_type, \ + #op, \ + #left, \ + __left, \ + #right, \ + __right), \ + fmt, \ + ##__VA_ARGS__); \ +} while (0) + +#define KUNIT_BASE_EQ_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, ==, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BASE_NE_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, !=, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BASE_LT_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, <, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BASE_LE_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, <=, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BASE_GT_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, >, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BASE_GE_MSG_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_BINARY_ASSERTION(test, \ + assert_class, \ + ASSERT_CLASS_INIT, \ + assert_type, \ + left, >=, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_EQ_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_EQ_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_EQ_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_EQ_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_EQ_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_EQ_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_EQ_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_EQ_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_NE_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_NE_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_NE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_NE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_NE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_NE_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_NE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_NE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_LT_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_LT_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_LT_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_LT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_LT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_LT_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_LT_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_LT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_LE_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_LE_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_LE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_LE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_LE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_LE_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_LE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_LE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_GT_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_GT_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_GT_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_GT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_GT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_GT_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_GT_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_GT_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_GE_MSG_ASSERTION(test, assert_type, left, right, fmt, ...)\ + KUNIT_BASE_GE_MSG_ASSERTION(test, \ + kunit_binary_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_GE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_GE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_PTR_GE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BASE_GE_MSG_ASSERTION(test, \ + kunit_binary_ptr_assert, \ + KUNIT_INIT_BINARY_PTR_ASSERT_STRUCT, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_PTR_GE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_PTR_GE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_STR_ASSERTION(test, \ + assert_type, \ + left, \ + op, \ + right, \ + fmt, \ + ...) \ +do { \ + typeof(left) __left = (left); \ + typeof(right) __right = (right); \ + \ + KUNIT_ASSERTION(test, \ + strcmp(__left, __right) op 0, \ + kunit_binary_str_assert, \ + KUNIT_INIT_BINARY_ASSERT_STRUCT(test, \ + assert_type, \ + #op, \ + #left, \ + __left, \ + #right, \ + __right), \ + fmt, \ + ##__VA_ARGS__); \ +} while (0) + +#define KUNIT_BINARY_STR_EQ_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BINARY_STR_ASSERTION(test, \ + assert_type, \ + left, ==, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_STR_EQ_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_STR_EQ_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_BINARY_STR_NE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + fmt, \ + ...) \ + KUNIT_BINARY_STR_ASSERTION(test, \ + assert_type, \ + left, !=, right, \ + fmt, \ + ##__VA_ARGS__) + +#define KUNIT_BINARY_STR_NE_ASSERTION(test, assert_type, left, right) \ + KUNIT_BINARY_STR_NE_MSG_ASSERTION(test, \ + assert_type, \ + left, \ + right, \ + NULL) + +#define KUNIT_PTR_NOT_ERR_OR_NULL_MSG_ASSERTION(test, \ + assert_type, \ + ptr, \ + fmt, \ + ...) \ +do { \ + typeof(ptr) __ptr = (ptr); \ + \ + KUNIT_ASSERTION(test, \ + !IS_ERR_OR_NULL(__ptr), \ + kunit_ptr_not_err_assert, \ + KUNIT_INIT_PTR_NOT_ERR_STRUCT(test, \ + assert_type, \ + #ptr, \ + __ptr), \ + fmt, \ + ##__VA_ARGS__); \ +} while (0) + +#define KUNIT_PTR_NOT_ERR_OR_NULL_ASSERTION(test, assert_type, ptr) \ + KUNIT_PTR_NOT_ERR_OR_NULL_MSG_ASSERTION(test, \ + assert_type, \ + ptr, \ + NULL) + +/** + * KUNIT_EXPECT_TRUE() - Causes a test failure when the expression is not true. + * @test: The test context object. + * @condition: an arbitrary boolean expression. The test fails when this does + * not evaluate to true. + * + * This and expectations of the form `KUNIT_EXPECT_*` will cause the test case + * to fail when the specified condition is not met; however, it will not prevent + * the test case from continuing to run; this is otherwise known as an + * *expectation failure*. + */ +#define KUNIT_EXPECT_TRUE(test, condition) \ + KUNIT_TRUE_ASSERTION(test, KUNIT_EXPECTATION, condition) + +#define KUNIT_EXPECT_TRUE_MSG(test, condition, fmt, ...) \ + KUNIT_TRUE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + condition, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_FALSE() - Makes a test failure when the expression is not false. + * @test: The test context object. + * @condition: an arbitrary boolean expression. The test fails when this does + * not evaluate to false. + * + * Sets an expectation that @condition evaluates to false. See + * KUNIT_EXPECT_TRUE() for more information. + */ +#define KUNIT_EXPECT_FALSE(test, condition) \ + KUNIT_FALSE_ASSERTION(test, KUNIT_EXPECTATION, condition) + +#define KUNIT_EXPECT_FALSE_MSG(test, condition, fmt, ...) \ + KUNIT_FALSE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + condition, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_EQ() - Sets an expectation that @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the values that @left and @right evaluate to are + * equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) == (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_EQ(test, left, right) \ + KUNIT_BINARY_EQ_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_EQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_EQ_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_PTR_EQ() - Expects that pointers @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a pointer. + * @right: an arbitrary expression that evaluates to a pointer. + * + * Sets an expectation that the values that @left and @right evaluate to are + * equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) == (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_PTR_EQ(test, left, right) \ + KUNIT_BINARY_PTR_EQ_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right) + +#define KUNIT_EXPECT_PTR_EQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_PTR_EQ_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_NE() - An expectation that @left and @right are not equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the values that @left and @right evaluate to are not + * equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) != (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_NE(test, left, right) \ + KUNIT_BINARY_NE_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_NE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_NE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_PTR_NE() - Expects that pointers @left and @right are not equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a pointer. + * @right: an arbitrary expression that evaluates to a pointer. + * + * Sets an expectation that the values that @left and @right evaluate to are not + * equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) != (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_PTR_NE(test, left, right) \ + KUNIT_BINARY_PTR_NE_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right) + +#define KUNIT_EXPECT_PTR_NE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_PTR_NE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_LT() - An expectation that @left is less than @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the value that @left evaluates to is less than the + * value that @right evaluates to. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) < (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_LT(test, left, right) \ + KUNIT_BINARY_LT_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_LT_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_LT_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_LE() - Expects that @left is less than or equal to @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the value that @left evaluates to is less than or + * equal to the value that @right evaluates to. Semantically this is equivalent + * to KUNIT_EXPECT_TRUE(@test, (@left) <= (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_LE(test, left, right) \ + KUNIT_BINARY_LE_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_LE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_LE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_GT() - An expectation that @left is greater than @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the value that @left evaluates to is greater than + * the value that @right evaluates to. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) > (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_GT(test, left, right) \ + KUNIT_BINARY_GT_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_GT_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_GT_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_GE() - Expects that @left is greater than or equal to @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an expectation that the value that @left evaluates to is greater than + * the value that @right evaluates to. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, (@left) >= (@right)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_GE(test, left, right) \ + KUNIT_BINARY_GE_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_GE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_GE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_STREQ() - Expects that strings @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a null terminated string. + * @right: an arbitrary expression that evaluates to a null terminated string. + * + * Sets an expectation that the values that @left and @right evaluate to are + * equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, !strcmp((@left), (@right))). See KUNIT_EXPECT_TRUE() + * for more information. + */ +#define KUNIT_EXPECT_STREQ(test, left, right) \ + KUNIT_BINARY_STR_EQ_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_STREQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_STR_EQ_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_STRNEQ() - Expects that strings @left and @right are not equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a null terminated string. + * @right: an arbitrary expression that evaluates to a null terminated string. + * + * Sets an expectation that the values that @left and @right evaluate to are + * not equal. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, strcmp((@left), (@right))). See KUNIT_EXPECT_TRUE() + * for more information. + */ +#define KUNIT_EXPECT_STRNEQ(test, left, right) \ + KUNIT_BINARY_STR_NE_ASSERTION(test, KUNIT_EXPECTATION, left, right) + +#define KUNIT_EXPECT_STRNEQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_STR_NE_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_EXPECT_NOT_ERR_OR_NULL() - Expects that @ptr is not null and not err. + * @test: The test context object. + * @ptr: an arbitrary pointer. + * + * Sets an expectation that the value that @ptr evaluates to is not null and not + * an errno stored in a pointer. This is semantically equivalent to + * KUNIT_EXPECT_TRUE(@test, !IS_ERR_OR_NULL(@ptr)). See KUNIT_EXPECT_TRUE() for + * more information. + */ +#define KUNIT_EXPECT_NOT_ERR_OR_NULL(test, ptr) \ + KUNIT_PTR_NOT_ERR_OR_NULL_ASSERTION(test, KUNIT_EXPECTATION, ptr) + +#define KUNIT_EXPECT_NOT_ERR_OR_NULL_MSG(test, ptr, fmt, ...) \ + KUNIT_PTR_NOT_ERR_OR_NULL_MSG_ASSERTION(test, \ + KUNIT_EXPECTATION, \ + ptr, \ + fmt, \ + ##__VA_ARGS__) + #endif /* _KUNIT_TEST_H */ -- cgit v1.2.3 From 5f3e06208920ee78b68cf3527f40ffbff83cb3bc Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:39 -0700 Subject: kunit: test: add support for test abort Add support for aborting/bailing out of test cases, which is needed for implementing assertions. An assertion is like an expectation, but bails out of the test case early if the assertion is not met. The idea with assertions is that you use them to state all the preconditions for your test. Logically speaking, these are the premises of the test case, so if a premise isn't true, there is no point in continuing the test case because there are no conclusions that can be drawn without the premises. Whereas, the expectation is the thing you are trying to prove. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/test.h | 2 ++ include/kunit/try-catch.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 include/kunit/try-catch.h (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 30a62de16bc9..3d554d7c1c79 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -10,6 +10,7 @@ #define _KUNIT_TEST_H #include +#include #include #include #include @@ -172,6 +173,7 @@ struct kunit { /* private: internal use only. */ const char *name; /* Read only after initialization! */ + struct kunit_try_catch try_catch; /* * success starts as true, and may only be set to false during a * test case; thus, it is safe to update this across multiple diff --git a/include/kunit/try-catch.h b/include/kunit/try-catch.h new file mode 100644 index 000000000000..404f336cbdc8 --- /dev/null +++ b/include/kunit/try-catch.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * An API to allow a function, that may fail, to be executed, and recover in a + * controlled manner. + * + * Copyright (C) 2019, Google LLC. + * Author: Brendan Higgins + */ + +#ifndef _KUNIT_TRY_CATCH_H +#define _KUNIT_TRY_CATCH_H + +#include + +typedef void (*kunit_try_catch_func_t)(void *); + +struct completion; +struct kunit; + +/** + * struct kunit_try_catch - provides a generic way to run code which might fail. + * @test: The test case that is currently being executed. + * @try_completion: Completion that the control thread waits on while test runs. + * @try_result: Contains any errno obtained while running test case. + * @try: The function, the test case, to attempt to run. + * @catch: The function called if @try bails out. + * @context: used to pass user data to the try and catch functions. + * + * kunit_try_catch provides a generic, architecture independent way to execute + * an arbitrary function of type kunit_try_catch_func_t which may bail out by + * calling kunit_try_catch_throw(). If kunit_try_catch_throw() is called, @try + * is stopped at the site of invocation and @catch is called. + * + * struct kunit_try_catch provides a generic interface for the functionality + * needed to implement kunit->abort() which in turn is needed for implementing + * assertions. Assertions allow stating a precondition for a test simplifying + * how test cases are written and presented. + * + * Assertions are like expectations, except they abort (call + * kunit_try_catch_throw()) when the specified condition is not met. This is + * useful when you look at a test case as a logical statement about some piece + * of code, where assertions are the premises for the test case, and the + * conclusion is a set of predicates, rather expectations, that must all be + * true. If your premises are violated, it does not makes sense to continue. + */ +struct kunit_try_catch { + /* private: internal use only. */ + struct kunit *test; + struct completion *try_completion; + int try_result; + kunit_try_catch_func_t try; + kunit_try_catch_func_t catch; + void *context; +}; + +void kunit_try_catch_init(struct kunit_try_catch *try_catch, + struct kunit *test, + kunit_try_catch_func_t try, + kunit_try_catch_func_t catch); + +void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context); + +void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch); + +static inline int kunit_try_catch_get_result(struct kunit_try_catch *try_catch) +{ + return try_catch->try_result; +} + +/* + * Exposed for testing only. + */ +void kunit_generic_try_catch_init(struct kunit_try_catch *try_catch); + +#endif /* _KUNIT_TRY_CATCH_H */ -- cgit v1.2.3 From e4aea8f8532b55f966d828efcf720f8aeb203e13 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:41 -0700 Subject: kunit: test: add the concept of assertions Add support for assertions which are like expectations except the test terminates if the assertion is not satisfied. The idea with assertions is that you use them to state all the preconditions for your test. Logically speaking, these are the premises of the test case, so if a premise isn't true, there is no point in continuing the test case because there are no conclusions that can be drawn without the premises. Whereas, the expectation is the thing you are trying to prove. It is not used universally in x-unit style test frameworks, but I really like it as a convention. You could still express the idea of a premise using the above idiom, but I think KUNIT_ASSERT_* states the intended idea perfectly. Signed-off-by: Brendan Higgins Reviewed-by: Greg Kroah-Hartman Reviewed-by: Logan Gunthorpe Reviewed-by: Stephen Boyd Signed-off-by: Shuah Khan --- include/kunit/test.h | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 280 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 3d554d7c1c79..8b7eb03d4971 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -87,8 +87,9 @@ struct kunit; * @name: the name of the test case. * * A test case is a function with the signature, - * ``void (*)(struct kunit *)`` that makes expectations (see - * KUNIT_EXPECT_TRUE()) about code under test. Each test case is associated + * ``void (*)(struct kunit *)`` + * that makes expectations and assertions (see KUNIT_EXPECT_TRUE() and + * KUNIT_ASSERT_TRUE()) about code under test. Each test case is associated * with a &struct kunit_suite and will be run after the suite's init * function and followed by the suite's exit function. * @@ -1210,4 +1211,281 @@ do { \ fmt, \ ##__VA_ARGS__) +#define KUNIT_ASSERT_FAILURE(test, fmt, ...) \ + KUNIT_FAIL_ASSERTION(test, KUNIT_ASSERTION, fmt, ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_TRUE() - Sets an assertion that @condition is true. + * @test: The test context object. + * @condition: an arbitrary boolean expression. The test fails and aborts when + * this does not evaluate to true. + * + * This and assertions of the form `KUNIT_ASSERT_*` will cause the test case to + * fail *and immediately abort* when the specified condition is not met. Unlike + * an expectation failure, it will prevent the test case from continuing to run; + * this is otherwise known as an *assertion failure*. + */ +#define KUNIT_ASSERT_TRUE(test, condition) \ + KUNIT_TRUE_ASSERTION(test, KUNIT_ASSERTION, condition) + +#define KUNIT_ASSERT_TRUE_MSG(test, condition, fmt, ...) \ + KUNIT_TRUE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + condition, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_FALSE() - Sets an assertion that @condition is false. + * @test: The test context object. + * @condition: an arbitrary boolean expression. + * + * Sets an assertion that the value that @condition evaluates to is false. This + * is the same as KUNIT_EXPECT_FALSE(), except it causes an assertion failure + * (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_FALSE(test, condition) \ + KUNIT_FALSE_ASSERTION(test, KUNIT_ASSERTION, condition) + +#define KUNIT_ASSERT_FALSE_MSG(test, condition, fmt, ...) \ + KUNIT_FALSE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + condition, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_EQ() - Sets an assertion that @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the values that @left and @right evaluate to are + * equal. This is the same as KUNIT_EXPECT_EQ(), except it causes an assertion + * failure (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_EQ(test, left, right) \ + KUNIT_BINARY_EQ_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_EQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_EQ_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_PTR_EQ() - Asserts that pointers @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a pointer. + * @right: an arbitrary expression that evaluates to a pointer. + * + * Sets an assertion that the values that @left and @right evaluate to are + * equal. This is the same as KUNIT_EXPECT_EQ(), except it causes an assertion + * failure (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_PTR_EQ(test, left, right) \ + KUNIT_BINARY_PTR_EQ_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_PTR_EQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_PTR_EQ_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_NE() - An assertion that @left and @right are not equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the values that @left and @right evaluate to are not + * equal. This is the same as KUNIT_EXPECT_NE(), except it causes an assertion + * failure (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_NE(test, left, right) \ + KUNIT_BINARY_NE_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_NE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_NE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_PTR_NE() - Asserts that pointers @left and @right are not equal. + * KUNIT_ASSERT_PTR_EQ() - Asserts that pointers @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a pointer. + * @right: an arbitrary expression that evaluates to a pointer. + * + * Sets an assertion that the values that @left and @right evaluate to are not + * equal. This is the same as KUNIT_EXPECT_NE(), except it causes an assertion + * failure (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_PTR_NE(test, left, right) \ + KUNIT_BINARY_PTR_NE_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_PTR_NE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_PTR_NE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) +/** + * KUNIT_ASSERT_LT() - An assertion that @left is less than @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the value that @left evaluates to is less than the + * value that @right evaluates to. This is the same as KUNIT_EXPECT_LT(), except + * it causes an assertion failure (see KUNIT_ASSERT_TRUE()) when the assertion + * is not met. + */ +#define KUNIT_ASSERT_LT(test, left, right) \ + KUNIT_BINARY_LT_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_LT_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_LT_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) +/** + * KUNIT_ASSERT_LE() - An assertion that @left is less than or equal to @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the value that @left evaluates to is less than or + * equal to the value that @right evaluates to. This is the same as + * KUNIT_EXPECT_LE(), except it causes an assertion failure (see + * KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_LE(test, left, right) \ + KUNIT_BINARY_LE_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_LE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_LE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_GT() - An assertion that @left is greater than @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the value that @left evaluates to is greater than the + * value that @right evaluates to. This is the same as KUNIT_EXPECT_GT(), except + * it causes an assertion failure (see KUNIT_ASSERT_TRUE()) when the assertion + * is not met. + */ +#define KUNIT_ASSERT_GT(test, left, right) \ + KUNIT_BINARY_GT_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_GT_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_GT_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_GE() - Assertion that @left is greater than or equal to @right. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a primitive C type. + * @right: an arbitrary expression that evaluates to a primitive C type. + * + * Sets an assertion that the value that @left evaluates to is greater than the + * value that @right evaluates to. This is the same as KUNIT_EXPECT_GE(), except + * it causes an assertion failure (see KUNIT_ASSERT_TRUE()) when the assertion + * is not met. + */ +#define KUNIT_ASSERT_GE(test, left, right) \ + KUNIT_BINARY_GE_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_GE_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_GE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_STREQ() - An assertion that strings @left and @right are equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a null terminated string. + * @right: an arbitrary expression that evaluates to a null terminated string. + * + * Sets an assertion that the values that @left and @right evaluate to are + * equal. This is the same as KUNIT_EXPECT_STREQ(), except it causes an + * assertion failure (see KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_STREQ(test, left, right) \ + KUNIT_BINARY_STR_EQ_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_STREQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_STR_EQ_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_STRNEQ() - Expects that strings @left and @right are not equal. + * @test: The test context object. + * @left: an arbitrary expression that evaluates to a null terminated string. + * @right: an arbitrary expression that evaluates to a null terminated string. + * + * Sets an expectation that the values that @left and @right evaluate to are + * not equal. This is semantically equivalent to + * KUNIT_ASSERT_TRUE(@test, strcmp((@left), (@right))). See KUNIT_ASSERT_TRUE() + * for more information. + */ +#define KUNIT_ASSERT_STRNEQ(test, left, right) \ + KUNIT_BINARY_STR_NE_ASSERTION(test, KUNIT_ASSERTION, left, right) + +#define KUNIT_ASSERT_STRNEQ_MSG(test, left, right, fmt, ...) \ + KUNIT_BINARY_STR_NE_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + left, \ + right, \ + fmt, \ + ##__VA_ARGS__) + +/** + * KUNIT_ASSERT_NOT_ERR_OR_NULL() - Assertion that @ptr is not null and not err. + * @test: The test context object. + * @ptr: an arbitrary pointer. + * + * Sets an assertion that the value that @ptr evaluates to is not null and not + * an errno stored in a pointer. This is the same as + * KUNIT_EXPECT_NOT_ERR_OR_NULL(), except it causes an assertion failure (see + * KUNIT_ASSERT_TRUE()) when the assertion is not met. + */ +#define KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr) \ + KUNIT_PTR_NOT_ERR_OR_NULL_ASSERTION(test, KUNIT_ASSERTION, ptr) + +#define KUNIT_ASSERT_NOT_ERR_OR_NULL_MSG(test, ptr, fmt, ...) \ + KUNIT_PTR_NOT_ERR_OR_NULL_MSG_ASSERTION(test, \ + KUNIT_ASSERTION, \ + ptr, \ + fmt, \ + ##__VA_ARGS__) + #endif /* _KUNIT_TEST_H */ -- cgit v1.2.3 From 741a98d022362c90609ac9dcd8ad56314d8e68b3 Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Mon, 23 Sep 2019 02:02:49 -0700 Subject: kunit: fix failure to build without printk Previously KUnit assumed that printk would always be present, which is not a valid assumption to make. Fix that by removing call to vprintk_emit, and calling printk directly. This fixes a build error[1] reported by Randy. For context this change comes after much discussion. My first stab[2] at this was just to make the KUnit logging code compile out; however, it was agreed that if we were going to use vprintk_emit, then vprintk_emit should provide a no-op stub, which lead to my second attempt[3]. In response to me trying to stub out vprintk_emit, Sergey Senozhatsky suggested a way for me to remove our usage of vprintk_emit, which led to my third attempt at solving this[4]. In my third version of this patch[4], I completely removed vprintk_emit, as suggested by Sergey; however, there was a bit of debate over whether Sergey's solution was the best. The debate arose due to Sergey's version resulting in a checkpatch warning, which resulted in a debate over correct printk usage. Joe Perches offered an alternative fix which was somewhat less far reaching than what Sergey had suggested and importantly relied on continuing to use %pV. Much of the debated centered around whether %pV should be widely used, and whether Sergey's version would result in object size bloat. Ultimately, we decided to go with Sergey's version. Reported-by: Randy Dunlap Link[1]: https://lore.kernel.org/linux-kselftest/c7229254-0d90-d90e-f3df-5b6d6fc0b51f@infradead.org/ Link[2]: https://lore.kernel.org/linux-kselftest/20190827174932.44177-1-brendanhiggins@google.com/ Link[3]: https://lore.kernel.org/linux-kselftest/20190827234835.234473-1-brendanhiggins@google.com/ Link[4]: https://lore.kernel.org/linux-kselftest/20190828093143.163302-1-brendanhiggins@google.com/ Cc: Stephen Rothwell Cc: Sergey Senozhatsky Cc: Joe Perches Cc: Tim.Bird@sony.com Signed-off-by: Brendan Higgins Acked-by: Randy Dunlap # build-tested Reviewed-by: Petr Mladek Signed-off-by: Shuah Khan --- include/kunit/test.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 8b7eb03d4971..dba48304b3bd 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -339,9 +339,8 @@ static inline void *kunit_kzalloc(struct kunit *test, size_t size, gfp_t gfp) void kunit_cleanup(struct kunit *test); -void __printf(3, 4) kunit_printk(const char *level, - const struct kunit *test, - const char *fmt, ...); +#define kunit_printk(lvl, test, fmt, ...) \ + printk(lvl "\t# %s: " fmt, (test)->name, ##__VA_ARGS__) /** * kunit_info() - Prints an INFO level message associated with @test. -- cgit v1.2.3 From be2644aac3e1db02d09f45d56206bbdafca582a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Oct 2019 10:49:06 -0700 Subject: tcp: add ipv6_addr_v4mapped_loopback() helper tcp_twsk_unique() has a hard coded assumption about ipv4 loopback being 127/8 Lets instead use the standard ipv4_is_loopback() method, in a new ipv6_addr_v4mapped_loopback() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 009605c56f20..d04b7abe2a4c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -696,6 +696,11 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } +static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) +{ + return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); +} + static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) -- cgit v1.2.3 From ff92741270bf8b6e78aa885f166b68c7a67ab13a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:15 +0200 Subject: net: introduce name_node struct to be used in hashlist Introduce name_node structure to hold name of device and put it into hashlist instead of putting there struct net_device directly. Add a necessary infrastructure to manipulate the hashlist. This prepares the code to use the same hashlist for alternative names introduced later in this set. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9eda1c31d1f7..e92bc5467256 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,6 +925,12 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; +struct netdev_name_node { + struct hlist_node hlist; + struct net_device *dev; + const char *name; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1564,7 +1570,7 @@ enum netdev_priv_flags { * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. * - * @name_hlist: Device name hash chain, please keep it close to name[] + * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start @@ -1774,7 +1780,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; - struct hlist_node name_hlist; + struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields -- cgit v1.2.3 From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ include/uapi/linux/if.h | 1 + include/uapi/linux/if_link.h | 2 ++ include/uapi/linux/rtnetlink.h | 7 +++++++ 4 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e92bc5467256..48cc71aae466 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -927,10 +927,14 @@ struct tlsdev_ops; struct netdev_name_node { struct hlist_node hlist; + struct list_head list; struct net_device *dev; const char *name; }; +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..4bf33344aab1 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -33,6 +33,7 @@ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 +#define ALTIFNAMSIZ 128 #include /* For glibc compatibility. An empty enum does not compile. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4a8c02cafa9a..8aec8769d944 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -167,6 +167,8 @@ enum { IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ __IFLA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index ce2a623abb75..1418a8362bb7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -164,6 +164,13 @@ enum { RTM_GETNEXTHOP, #define RTM_GETNEXTHOP RTM_GETNEXTHOP + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; -- cgit v1.2.3 From afa0df5998131153ec3036f41e76ece33bf1334f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:09 +0200 Subject: net: push loops and nb calls into helper functions Push iterations over net namespaces and netdevices from register_netdevice_notifier() and unregister_netdevice_notifier() into helper functions. Along with that introduce continue_reverse macros to make the code a bit nicer allowing to get rid of "last" marks. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/net/net_namespace.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 48cc71aae466..7b183f724fc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2574,6 +2574,9 @@ extern rwlock_t dev_base_lock; /* Device list lock */ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) +#define for_each_netdev_continue_reverse(net, d) \ + list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ + dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f8712bbeb2e0..c5a98e03591d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -317,7 +317,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) - +#define for_each_net_continue_reverse(VAR) \ + list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) -- cgit v1.2.3 From a30c7b429f2dd980202c912fcb76442364937b4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:10 +0200 Subject: net: introduce per-netns netdevice notifiers Often the code for example in drivers is interested in getting notifier call only from certain network namespace. In addition to the existing global netdevice notifier chain introduce per-netns chains and allow users to register to that. Eventually this would eliminate unnecessary overhead in case there are many netdevices in many network namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/net/net_namespace.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b183f724fc4..fe45b2c72315 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2504,6 +2504,9 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb); struct netdev_notifier_info { struct net_device *dev; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a98e03591d..5ac2bb16d4b3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -36,6 +36,7 @@ #include #include #include +#include struct user_namespace; struct proc_dir_entry; @@ -96,6 +97,8 @@ struct net { struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct raw_notifier_head netdev_chain; + unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; unsigned int dev_unreg_count; -- cgit v1.2.3 From 37048e94a2dc81a5a259963117f62341e25161f7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 22:12:50 +0300 Subject: net: dsa: Remove unused __DSA_SKB_CB macro The struct __dsa_skb_cb is supposed to span the entire 48-byte skb control block, while the struct dsa_skb_cb only the portion of it which is used by the DSA core (the rest is available as private data to drivers). The DSA_SKB_CB and __DSA_SKB_CB helpers are supposed to help retrieve this pointer based on a skb, but it turns out there is nobody directly interested in the struct __dsa_skb_cb in the kernel. So remove it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 541fb514e31d..8c3ea0530f65 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -94,8 +94,6 @@ struct __dsa_skb_cb { u8 priv[48 - sizeof(struct dsa_skb_cb)]; }; -#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb)) - #define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) #define DSA_SKB_CB_PRIV(skb) \ -- cgit v1.2.3 From 2d2f116d69c127099553afe0d87cf9c0bbe2759e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 12 Sep 2019 20:22:38 -0700 Subject: gpiolib: introduce devm_fwnode_gpiod_get_index() devm_fwnode_get_index_gpiod_from_child() is too long, besides the fwnode in question does not have to be a child of device node. Let's rename it to devm_fwnode_gpiod_get_index() and keep the old name for compatibility for now. Also let's add a devm_fwnode_gpiod_get() wrapper as majority of the callers need a single GPIO. Reviewed-by: Andy Shevchenko Signed-off-by: Dmitry Torokhov Reviewed-by: Mika Westerberg Link: https://lore.kernel.org/r/20190913032240.50333-2-dmitry.torokhov@gmail.com Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index b70af921c614..dc0ddcd30515 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -176,11 +176,11 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, enum gpiod_flags dflags, const char *label); -struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, - const char *con_id, int index, - struct fwnode_handle *child, - enum gpiod_flags flags, - const char *label); +struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, + struct fwnode_handle *child, + const char *con_id, int index, + enum gpiod_flags flags, + const char *label); #else /* CONFIG_GPIOLIB */ @@ -531,6 +531,29 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, return ERR_PTR(-ENOSYS); } +static inline +struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, + struct fwnode_handle *fwnode, + const char *con_id, int index, + enum gpiod_flags flags, + const char *label) +{ + return ERR_PTR(-ENOSYS); +} + +#endif /* CONFIG_GPIOLIB */ + +static inline +struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, + struct fwnode_handle *fwnode, + const char *con_id, + enum gpiod_flags flags, + const char *label) +{ + return devm_fwnode_gpiod_get_index(dev, fwnode, con_id, 0, + flags, label); +} + static inline struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, const char *con_id, int index, @@ -538,11 +561,10 @@ struct gpio_desc *devm_fwnode_get_index_gpiod_from_child(struct device *dev, enum gpiod_flags flags, const char *label) { - return ERR_PTR(-ENOSYS); + return devm_fwnode_gpiod_get_index(dev, child, con_id, index, + flags, label); } -#endif /* CONFIG_GPIOLIB */ - static inline struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, const char *con_id, @@ -550,8 +572,7 @@ struct gpio_desc *devm_fwnode_get_gpiod_from_child(struct device *dev, enum gpiod_flags flags, const char *label) { - return devm_fwnode_get_index_gpiod_from_child(dev, con_id, 0, child, - flags, label); + return devm_fwnode_gpiod_get_index(dev, child, con_id, 0, flags, label); } #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_OF_GPIO) -- cgit v1.2.3 From 13949fa9daa91a60c7cfef40755f7611cc2cf653 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 12 Sep 2019 20:22:39 -0700 Subject: gpiolib: introduce fwnode_gpiod_get_index() This introduces fwnode_gpiod_get_index() that iterates through common gpio suffixes when trying to locate a GPIO within a given firmware node. We also switch devm_fwnode_gpiod_get_index() to call fwnode_gpiod_get_index() instead of iterating through GPIO suffixes on its own. Reviewed-by: Andy Shevchenko Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20190913032240.50333-3-dmitry.torokhov@gmail.com Reviewed-by: Mika Westerberg Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index dc0ddcd30515..5215fdba6b9a 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -176,6 +176,10 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, const char *propname, int index, enum gpiod_flags dflags, const char *label); +struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, + const char *con_id, int index, + enum gpiod_flags flags, + const char *label); struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, struct fwnode_handle *child, const char *con_id, int index, @@ -531,6 +535,15 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, return ERR_PTR(-ENOSYS); } +static inline +struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, + const char *con_id, int index, + enum gpiod_flags flags, + const char *label) +{ + return ERR_PTR(-ENOSYS); +} + static inline struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, struct fwnode_handle *fwnode, -- cgit v1.2.3 From 968a2978cb39a754750d35a47049781660682a31 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 2 Oct 2019 16:52:57 +0200 Subject: net: stmmac: Only enable enhanced addressing mode when needed Enhanced addressing mode is only required when more than 32 bits need to be addressed. Add a DMA configuration parameter to enable this mode only when needed. Signed-off-by: Thierry Reding Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dc60d03c4b60..86f9464c3f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -92,6 +92,7 @@ struct stmmac_dma_cfg { int fixed_burst; int mixed_burst; bool aal; + bool eame; }; #define AXI_BLEN 7 -- cgit v1.2.3 From 4b76f9ed47074990d85c2ec52288a7d09c3ea357 Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Fri, 13 Sep 2019 18:11:44 +0530 Subject: nl80211: Document the expectation for NL80211_ATTR_IE in NL80211_CMD_CONNECT This commit documents the expectation for NL80211_ATTR_IE when included in NL80211_CMD_CONNECT, as following. Driver shall not modify the IEs specified through NL80211_ATTR_IE if NL80211_ATTR_MAC is included. However, if NL80211_ATTR_MAC_HINT is included, these IEs through NL80211_ATTR_IE are specified by the user space based on the best possible BSS selected. Thus, if the driver ends up selecting a different BSS, it can modify these IEs accordingly (e.g. userspace asks the driver to perform PMKSA caching with BSS1 and the driver ends up selecting BSS2 with different PMKSA cache entry. RSNIE has to get updated with the apt PMKID). Signed-off-by: Sunil Dutt Link: https://lore.kernel.org/r/1568378504-15179-1-git-send-email-usdutt@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beee59c831a7..64135ab3a7ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -571,6 +571,14 @@ * set of BSSID,frequency parameters is used (i.e., either the enforcing * %NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict * %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT). + * Driver shall not modify the IEs specified through %NL80211_ATTR_IE if + * %NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is + * included, these IEs through %NL80211_ATTR_IE are specified by the user + * space based on the best possible BSS selected. Thus, if the driver ends + * up selecting a different BSS, it can modify these IEs accordingly (e.g. + * userspace asks the driver to perform PMKSA caching with BSS1 and the + * driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE + * has to get updated with the apt PMKID). * %NL80211_ATTR_PREV_BSSID can be used to request a reassociation within * the ESS in case the device is already associated and an association with * a different BSS is desired. -- cgit v1.2.3 From 2ce113de31320756b25179f3f4512a522bc45263 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Oct 2019 11:12:25 +0200 Subject: mac80211: simplify TX aggregation start There really is no need to make drivers call the ieee80211_start_tx_ba_cb_irqsafe() function and then schedule the worker if all we want is to set a bit. Add a new return value (that was previously considered invalid) to indicate that the driver is immediately ready for the session, and make drivers use it. The only drivers that remain different are the Intel ones as they need to negotiate more with the firmware. Link: https://lore.kernel.org/r/1570007543-I152912660131cbab2e5d80b4218238c20f8a06e5@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 523c6a09e1c8..d69081c38788 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3095,7 +3095,9 @@ enum ieee80211_filter_flags { * * @IEEE80211_AMPDU_RX_START: start RX aggregation * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation - * @IEEE80211_AMPDU_TX_START: start TX aggregation + * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either + * call ieee80211_start_tx_ba_cb_irqsafe() or return the special + * status %IEEE80211_AMPDU_TX_START_IMMEDIATE. * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting * queued packets, now unaggregated. After all packets are transmitted the @@ -3119,6 +3121,8 @@ enum ieee80211_ampdu_mlme_action { IEEE80211_AMPDU_TX_OPERATIONAL, }; +#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1 + /** * struct ieee80211_ampdu_params - AMPDU action parameters * @@ -3896,7 +3900,10 @@ struct ieee80211_ops { * * Even ``189`` would be wrong since 1 could be lost again. * - * Returns a negative error code on failure. + * Returns a negative error code on failure. The driver may return + * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START + * if the session can start immediately. + * * The callback can sleep. */ int (*ampdu_action)(struct ieee80211_hw *hw, -- cgit v1.2.3 From 5b0fe9552336338acb52756daf65dd7a4eeca73f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Sep 2019 11:42:05 +1000 Subject: crypto: algif_skcipher - Use chunksize instead of blocksize When algif_skcipher does a partial operation it always process data that is a multiple of blocksize. However, for algorithms such as CTR this is wrong because even though it can process any number of bytes overall, the partial block must come at the very end and not in the middle. This is exactly what chunksize is meant to describe so this patch changes blocksize to chunksize. Fixes: 8ff590903d5f ("crypto: algif_skcipher - User-space...") Signed-off-by: Herbert Xu Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/skcipher.h | 30 ------------------------------ include/crypto/skcipher.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 734b6f7081b8..3175dfeaed2c 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -205,19 +205,6 @@ static inline unsigned int crypto_skcipher_alg_max_keysize( return alg->max_keysize; } -static inline unsigned int crypto_skcipher_alg_chunksize( - struct skcipher_alg *alg) -{ - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blocksize; - - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_blocksize; - - return alg->chunksize; -} - static inline unsigned int crypto_skcipher_alg_walksize( struct skcipher_alg *alg) { @@ -231,23 +218,6 @@ static inline unsigned int crypto_skcipher_alg_walksize( return alg->walksize; } -/** - * crypto_skcipher_chunksize() - obtain chunk size - * @tfm: cipher handle - * - * The block size is set to one for ciphers such as CTR. However, - * you still need to provide incremental updates in multiples of - * the underlying block size as the IV does not have sub-block - * granularity. This is known in this API as the chunk size. - * - * Return: chunk size in bytes - */ -static inline unsigned int crypto_skcipher_chunksize( - struct crypto_skcipher *tfm) -{ - return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm)); -} - /** * crypto_skcipher_walksize() - obtain walk size * @tfm: cipher handle diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 37c164234d97..aada87916918 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -304,6 +304,36 @@ static inline unsigned int crypto_skcipher_blocksize( return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm)); } +static inline unsigned int crypto_skcipher_alg_chunksize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blocksize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_blocksize; + + return alg->chunksize; +} + +/** + * crypto_skcipher_chunksize() - obtain chunk size + * @tfm: cipher handle + * + * The block size is set to one for ciphers such as CTR. However, + * you still need to provide incremental updates in multiples of + * the underlying block size as the IV does not have sub-block + * granularity. This is known in this API as the chunk size. + * + * Return: chunk size in bytes + */ +static inline unsigned int crypto_skcipher_chunksize( + struct crypto_skcipher *tfm) +{ + return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm)); +} + static inline unsigned int crypto_sync_skcipher_blocksize( struct crypto_sync_skcipher *tfm) { -- cgit v1.2.3 From 7c550daffe22a97282effa75fe7c1f6b83563ecb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:27 +0200 Subject: net: fib_notifier: make FIB notifier per-netns Currently all users of FIB notifier only cares about events in init_net. Later in this patchset, users get interested in other namespaces too. However, for every registered block user is interested only about one namespace. Make the FIB notifier registration per-netns and avoid unnecessary calls of notifier block for other namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 10 ++-------- include/net/fib_notifier.h | 7 +++---- include/net/ip6_fib.h | 2 +- include/net/ip_fib.h | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 34de06b426ef..0931631bbc13 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -47,7 +47,6 @@ struct vif_entry_notifier_info { }; static inline int mr_call_vif_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, @@ -56,7 +55,6 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -64,7 +62,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, @@ -77,7 +75,6 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -173,7 +170,6 @@ struct mfc_entry_notifier_info { }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id) @@ -181,13 +177,12 @@ static inline int mr_call_mfc_notifier(struct notifier_block *nb, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, @@ -199,7 +194,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index c49d7bfb5c30..23353f67b2b0 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -8,7 +8,6 @@ struct module; struct fib_notifier_info { - struct net *net; int family; struct netlink_ext_ack *extack; }; @@ -35,14 +34,14 @@ struct fib_notifier_ops { struct rcu_head rcu; }; -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); -int register_fib_notifier(struct notifier_block *nb, +int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb)); -int unregister_fib_notifier(struct notifier_block *nb); +int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); void fib_notifier_ops_unregister(struct fib_notifier_ops *ops); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4b5656c71abc..14e9fca0e326 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -478,7 +478,7 @@ struct ipv6_route_iter { extern const struct seq_operations ipv6_route_seq_ops; -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ab1ca9e238d2..a9df85304f40 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -219,7 +219,7 @@ struct fib_nh_notifier_info { struct fib_nh *fib_nh; }; -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, -- cgit v1.2.3 From 55c894f762a1a99fca80ee55d593083d78e7e4fb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:28 +0200 Subject: net: fib_notifier: propagate possible error during fib notifier registration Unlike events for registered notifier, during the registration, the errors that happened for the block being registered are not propagated up to the caller. Make sure the error is propagated for FIB rules and entries. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a9df85304f40..05c1fd9c5e23 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,7 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -void fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb); struct fib_table { struct hlist_node tb_hlist; -- cgit v1.2.3 From b7a595577ef3dc9add2b3e6d00869d017306bfbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:30 +0200 Subject: net: fib_notifier: propagate extack down to the notifier block callback Since errors are propagated all the way up to the caller, propagate possible extack of the caller all the way down to the notifier block callback. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 18 ++++++++++++------ include/net/fib_notifier.h | 6 ++++-- include/net/fib_rules.h | 3 ++- include/net/ip6_fib.h | 9 ++++++--- include/net/ip_fib.h | 9 ++++++--- 5 files changed, 30 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0931631bbc13..8071148f29a6 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -50,11 +50,13 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, - unsigned short vif_index, u32 tb_id) + unsigned short vif_index, u32 tb_id, + struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .dev = vif->dev, .vif_index = vif_index, @@ -172,11 +174,13 @@ struct mfc_entry_notifier_info { static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, - struct mr_mfc *mfc, u32 tb_id) + struct mr_mfc *mfc, u32 tb_id, + struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .mfc = mfc, .tb_id = tb_id @@ -295,10 +299,11 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock); + rwlock_t *mrt_lock, struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -349,10 +354,11 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { return -EINVAL; } diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 23353f67b2b0..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -29,7 +29,8 @@ struct fib_notifier_ops { int family; struct list_head list; unsigned int (*fib_seq_read)(struct net *net); - int (*fib_dump)(struct net *net, struct notifier_block *nb); + int (*fib_dump)(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct module *owner; struct rcu_head rcu; }; @@ -40,7 +41,8 @@ int call_fib_notifier(struct notifier_block *nb, int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int register_fib_notifier(struct net *net, struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)); + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack); int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 20dcadd8eed9..54e227e6b06a 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -194,7 +194,8 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family); +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 14e9fca0e326..5d1615463138 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -488,7 +488,8 @@ int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); -int fib6_tables_dump(struct net *net, struct notifier_block *nb); +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); @@ -504,7 +505,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); -int fib6_rules_dump(struct net *net, struct notifier_block *nb); +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, @@ -537,7 +539,8 @@ static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } -static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 05c1fd9c5e23..52b2406a5dfc 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,8 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -int fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; @@ -315,7 +316,8 @@ static inline bool fib4_rule_default(const struct fib_rule *rule) return true; } -static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -377,7 +379,8 @@ out: } bool fib4_rule_default(const struct fib_rule *rule); -int fib4_rules_dump(struct net *net, struct notifier_block *nb); +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, -- cgit v1.2.3 From 471f894f106573b0b086d1003ee6172253c67b59 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:31 +0200 Subject: net: devlink: export devlink net getter Allow drivers to get net struct for devlink instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..5ac2be0f0857 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -771,6 +771,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; +struct net *devlink_net(const struct devlink *devlink); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); -- cgit v1.2.3 From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:39 +0200 Subject: net: devlink: allow to change namespaces during reload All devlink instances are created in init_net and stay there for a lifetime. Allow user to be able to move devlink instances into namespaces during devlink reload operation. That ensures proper re-instantiation of driver objects, including netdevices. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 2 +- include/uapi/linux/devlink.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 5ac2be0f0857..3c9d4a063c98 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -643,7 +643,7 @@ enum devlink_trap_group_generic_id { } struct devlink_ops { - int (*reload_down)(struct devlink *devlink, + int (*reload_down)(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, struct netlink_ext_ack *extack); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 580b7a2e40e1..b558ea88b766 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,6 +421,10 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ + DEVLINK_ATTR_NETNS_PID, /* u32 */ + DEVLINK_ATTR_NETNS_ID, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 25a3cd8189c8832c04225e6f1d41228fd6cc64cc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:54 -0700 Subject: net/tls: move TOE-related structures to a separate header Move tls_device structure and register/unregister functions to a new header to avoid confusion with normal, non-TOE offload. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 34 ------------------------ include/net/tls_toe.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 34 deletions(-) create mode 100644 include/net/tls_toe.h (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index c664e6dba0d1..57865c944095 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -60,7 +60,6 @@ #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 -#define TLS_DEVICE_NAME_MAX 32 #define MAX_IV_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 @@ -74,37 +73,6 @@ */ #define TLS_AES_CCM_IV_B0_BYTE 2 -/* - * This structure defines the routines for Inline TLS driver. - * The following routines are optional and filled with a - * null pointer if not defined. - * - * @name: Its the name of registered Inline tls device - * @dev_list: Inline tls device list - * int (*feature)(struct tls_device *device); - * Called to return Inline TLS driver capability - * - * int (*hash)(struct tls_device *device, struct sock *sk); - * This function sets Inline driver for listen and program - * device specific functioanlity as required - * - * void (*unhash)(struct tls_device *device, struct sock *sk); - * This function cleans listen state set by Inline TLS driver - * - * void (*release)(struct kref *kref); - * Release the registered device and allocated resources - * @kref: Number of reference to tls_device - */ -struct tls_device { - char name[TLS_DEVICE_NAME_MAX]; - struct list_head dev_list; - int (*feature)(struct tls_device *device); - int (*hash)(struct tls_device *device, struct sock *sk); - void (*unhash)(struct tls_device *device, struct sock *sk); - void (*release)(struct kref *kref); - struct kref kref; -}; - enum { TLS_BASE, TLS_SW, @@ -643,8 +611,6 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk) int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); -void tls_register_device(struct tls_device *device); -void tls_unregister_device(struct tls_device *device); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h new file mode 100644 index 000000000000..81b66c76b31f --- /dev/null +++ b/include/net/tls_toe.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson . All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +struct sock; + +#define TLS_DEVICE_NAME_MAX 32 + +/* + * This structure defines the routines for Inline TLS driver. + * The following routines are optional and filled with a + * null pointer if not defined. + * + * @name: Its the name of registered Inline tls device + * @dev_list: Inline tls device list + * int (*feature)(struct tls_device *device); + * Called to return Inline TLS driver capability + * + * int (*hash)(struct tls_device *device, struct sock *sk); + * This function sets Inline driver for listen and program + * device specific functioanlity as required + * + * void (*unhash)(struct tls_device *device, struct sock *sk); + * This function cleans listen state set by Inline TLS driver + * + * void (*release)(struct kref *kref); + * Release the registered device and allocated resources + * @kref: Number of reference to tls_device + */ +struct tls_device { + char name[TLS_DEVICE_NAME_MAX]; + struct list_head dev_list; + int (*feature)(struct tls_device *device); + int (*hash)(struct tls_device *device, struct sock *sk); + void (*unhash)(struct tls_device *device, struct sock *sk); + void (*release)(struct kref *kref); + struct kref kref; +}; + +void tls_register_device(struct tls_device *device); +void tls_unregister_device(struct tls_device *device); -- cgit v1.2.3 From f21912edd1570818cbcb16bd1da7d7a2b122d66b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:55 -0700 Subject: net/tls: rename tls_device to tls_toe_device Rename struct tls_device to struct tls_toe_device to avoid confusion with normal, non-TOE offload. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls_toe.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index 81b66c76b31f..b56d30a5bd6d 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -36,7 +36,7 @@ struct sock; -#define TLS_DEVICE_NAME_MAX 32 +#define TLS_TOE_DEVICE_NAME_MAX 32 /* * This structure defines the routines for Inline TLS driver. @@ -45,29 +45,29 @@ struct sock; * * @name: Its the name of registered Inline tls device * @dev_list: Inline tls device list - * int (*feature)(struct tls_device *device); + * int (*feature)(struct tls_toe_device *device); * Called to return Inline TLS driver capability * - * int (*hash)(struct tls_device *device, struct sock *sk); + * int (*hash)(struct tls_toe_device *device, struct sock *sk); * This function sets Inline driver for listen and program * device specific functioanlity as required * - * void (*unhash)(struct tls_device *device, struct sock *sk); + * void (*unhash)(struct tls_toe_device *device, struct sock *sk); * This function cleans listen state set by Inline TLS driver * * void (*release)(struct kref *kref); * Release the registered device and allocated resources - * @kref: Number of reference to tls_device + * @kref: Number of reference to tls_toe_device */ -struct tls_device { - char name[TLS_DEVICE_NAME_MAX]; +struct tls_toe_device { + char name[TLS_TOE_DEVICE_NAME_MAX]; struct list_head dev_list; - int (*feature)(struct tls_device *device); - int (*hash)(struct tls_device *device, struct sock *sk); - void (*unhash)(struct tls_device *device, struct sock *sk); + int (*feature)(struct tls_toe_device *device); + int (*hash)(struct tls_toe_device *device, struct sock *sk); + void (*unhash)(struct tls_toe_device *device, struct sock *sk); void (*release)(struct kref *kref); struct kref kref; }; -void tls_register_device(struct tls_device *device); -void tls_unregister_device(struct tls_device *device); +void tls_toe_register_device(struct tls_toe_device *device); +void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From 08700dab816847d5e600ef263155fb04ea4b312d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:57 -0700 Subject: net/tls: move TOE-related code to a separate file Move tls_hw_* functions to a new, separate source file to avoid confusion with normal, non-TOE offload. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 3 +++ include/net/tls_toe.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 57865c944095..5c48cb9e0c18 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -308,7 +308,10 @@ struct tls_offload_context_rx { #define TLS_OFFLOAD_CONTEXT_SIZE_RX \ (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) +struct tls_context *tls_ctx_create(struct sock *sk); void tls_ctx_free(struct sock *sk, struct tls_context *ctx); +void update_sk_prot(struct sock *sk, struct tls_context *ctx); + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index b56d30a5bd6d..3bb39c795aed 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -69,5 +69,9 @@ struct tls_toe_device { struct kref kref; }; +int tls_hw_prot(struct sock *sk); +int tls_hw_hash(struct sock *sk); +void tls_hw_unhash(struct sock *sk); + void tls_toe_register_device(struct tls_toe_device *device); void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From 0eb8745e03c9ed2a7412c7a844ebc4f0e4f80de4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Oct 2019 11:18:58 -0700 Subject: net/tls: rename tls_hw_* functions tls_toe_* The tls_hw_* functions are quite confusingly named, since they are related to the TOE-offload, not TLS_HW offload which doesn't require TOE. Rename them. Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls_toe.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h index 3bb39c795aed..b3aa7593ce2c 100644 --- a/include/net/tls_toe.h +++ b/include/net/tls_toe.h @@ -69,9 +69,9 @@ struct tls_toe_device { struct kref kref; }; -int tls_hw_prot(struct sock *sk); -int tls_hw_hash(struct sock *sk); -void tls_hw_unhash(struct sock *sk); +int tls_toe_bypass(struct sock *sk); +int tls_toe_hash(struct sock *sk); +void tls_toe_unhash(struct sock *sk); void tls_toe_register_device(struct tls_toe_device *device); void tls_toe_unregister_device(struct tls_toe_device *device); -- cgit v1.2.3 From d6547f2a2cfc8b145b59291d3e4b072891f34882 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:29:24 +0300 Subject: net, uapi: fix -Wpointer-arith warnings Add casts to fix these warnings: ./usr/include/linux/netfilter_arp/arp_tables.h:200:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_bridge/ebtables.h:197:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv4/ip_tables.h:223:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv6/ip6_tables.h:263:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:310:28: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:410:24: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/virtio_ring.h:170:16: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] Those are theoretical probably but kernel doesn't control compiler flags in userspace. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/uapi/linux/netfilter_arp/arp_tables.h | 2 +- include/uapi/linux/netfilter_bridge/ebtables.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 2 +- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +- include/uapi/linux/tipc_config.h | 4 ++-- include/uapi/linux/virtio_ring.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index a2a0927d9bd6..bbf5af2b67a8 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -199,7 +199,7 @@ struct arpt_get_entries { /* Helper functions */ static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 8076c940ffeb..a494cf43a755 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -194,7 +194,7 @@ struct ebt_entry { static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { - return (void *)e + e->target_offset; + return (struct ebt_entry_target *)((char *)e + e->target_offset); } /* {g,s}etsockopt numbers */ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 6aaeb14bfce1..50c7fee625ae 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -222,7 +222,7 @@ struct ipt_get_entries { static __inline__ struct xt_entry_target * ipt_get_target(struct ipt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 031d0a43bed2..d9e364f96a5c 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -262,7 +262,7 @@ struct ip6t_get_entries { static __inline__ struct xt_entry_target * ip6t_get_target(struct ip6t_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4955e1a9f1bc..4dfc05651c98 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -309,7 +309,7 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_ptr->tlv_len = htons(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); - memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); + memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } @@ -409,7 +409,7 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, tcm_hdr->tcm_flags = htons(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); - memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); + memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 4c4e24c291a5..559f42e73315 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -169,7 +169,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, { vr->num = num; vr->desc = p; - vr->avail = p + num*sizeof(struct vring_desc); + vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } -- cgit v1.2.3 From 193d357d087309f2d5ab8e8caab1af5e3bc29fa0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:56:37 +0300 Subject: net: spread "enum sock_flags" Some ints are "enum sock_flags" in fact. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..ab905c4b1f0e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2512,7 +2512,7 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } -void sock_enable_timestamp(struct sock *sk, int flag); +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); -- cgit v1.2.3 From 8538d29cea9530f114159e06bfa31b2358161493 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:22 -0700 Subject: net/tls: add tracing for device/offload events Add tracing of device-related interaction to aid performance analysis, especially around resync: tls:tls_device_offload_set tls:tls_device_rx_resync_send tls:tls_device_rx_resync_nh_schedule tls:tls_device_rx_resync_nh_delay tls:tls_device_tx_resync_req tls:tls_device_tx_resync_send Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 5c48cb9e0c18..38086ade65ce 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -594,13 +594,6 @@ tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) tls_offload_ctx_rx(tls_ctx)->resync_type = type; } -static inline void tls_offload_tx_resync_request(struct sock *sk) -{ - struct tls_context *tls_ctx = tls_get_ctx(sk); - - WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags)); -} - /* Driver's seq tracking has to be disabled until resync succeeded */ static inline bool tls_offload_tx_resync_pending(struct sock *sk) { @@ -634,6 +627,7 @@ void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); +void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); #else static inline void tls_device_init(void) {} -- cgit v1.2.3 From d26b698dd3cd52f5a3277446a87e5e0198c99cd0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:24 -0700 Subject: net/tls: add skeleton of MIB statistics Add a skeleton structure for adding TLS statistics. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netns/mib.h | 3 +++ include/net/snmp.h | 6 ++++++ include/net/tls.h | 13 +++++++++++++ include/uapi/linux/snmp.h | 7 +++++++ 4 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 830bdf345b17..b5fdb108d602 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -24,6 +24,9 @@ struct netns_mib { #ifdef CONFIG_XFRM_STATISTICS DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics); #endif +#if IS_ENABLED(CONFIG_TLS) + DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics); +#endif }; #endif diff --git a/include/net/snmp.h b/include/net/snmp.h index cb8ced4380a6..468a67836e2f 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -111,6 +111,12 @@ struct linux_xfrm_mib { unsigned long mibs[LINUX_MIB_XFRMMAX]; }; +/* Linux TLS */ +#define LINUX_MIB_TLSMAX __LINUX_MIB_TLSMAX +struct linux_tls_mib { + unsigned long mibs[LINUX_MIB_TLSMAX]; +}; + #define DEFINE_SNMP_STAT(type, name) \ __typeof__(type) __percpu *name #define DEFINE_SNMP_STAT_ATOMIC(type, name) \ diff --git a/include/net/tls.h b/include/net/tls.h index 38086ade65ce..24c37bffc961 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -73,6 +74,15 @@ */ #define TLS_AES_CCM_IV_B0_BYTE 2 +#define __TLS_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define TLS_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.tls_statistics, field) +#define __TLS_DEC_STATS(net, field) \ + __SNMP_DEC_STATS((net)->mib.tls_statistics, field) +#define TLS_DEC_STATS(net, field) \ + SNMP_DEC_STATS((net)->mib.tls_statistics, field) + enum { TLS_BASE, TLS_SW, @@ -605,6 +615,9 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk) return ret; } +int __net_init tls_proc_init(struct net *net); +void __net_exit tls_proc_fini(struct net *net); + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); int decrypt_skb(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 549a31c29f7d..4abd57948ad4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -323,4 +323,11 @@ enum __LINUX_MIB_XFRMMAX }; +/* linux TLS mib definitions */ +enum +{ + LINUX_MIB_TLSNUM = 0, + __LINUX_MIB_TLSMAX +}; + #endif /* _LINUX_SNMP_H */ -- cgit v1.2.3 From b32fd3cc31d723bf2ab859667be3612c0086ec72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:25 -0700 Subject: net/tls: add statistics for installed sessions Add SNMP stats for number of sockets with successfully installed sessions. Break them down to software and hardware ones. Note that if hardware offload fails stack uses software implementation, and counts the session appropriately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4abd57948ad4..1b4613b5af70 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -327,6 +327,14 @@ enum enum { LINUX_MIB_TLSNUM = 0, + LINUX_MIB_TLSCURRTXSW, /* TlsCurrTxSw */ + LINUX_MIB_TLSCURRRXSW, /* TlsCurrRxSw */ + LINUX_MIB_TLSCURRTXDEVICE, /* TlsCurrTxDevice */ + LINUX_MIB_TLSCURRRXDEVICE, /* TlsCurrRxDevice */ + LINUX_MIB_TLSTXSW, /* TlsTxSw */ + LINUX_MIB_TLSRXSW, /* TlsRxSw */ + LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ + LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5c5ec66858062a857cf51f57cbe52b36330f7ae6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:26 -0700 Subject: net/tls: add TlsDecryptError stat Add a statistic for TLS record decryption errors. Since devices are supposed to pass records as-is when they encounter errors this statistic will count bad records in both pure software and inline crypto configurations. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1b4613b5af70..c9e4963e26f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -335,6 +335,7 @@ enum LINUX_MIB_TLSRXSW, /* TlsRxSw */ LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ + LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From a4d26fdbc2a5414bb1b67198656cc7e24a4a3c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:27 -0700 Subject: net/tls: add TlsDeviceRxResync statistic Add a statistic for number of RX resyncs sent down to the NIC. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index c9e4963e26f0..7eee233e78d2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -336,6 +336,7 @@ enum LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ + LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 8273fd845447820c26b38821c8ac297f40a65260 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 08:10:31 +0200 Subject: net: devlink: export devlink net setter For newly allocated devlink instance allow drivers to set net struct Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3c9d4a063c98..4095657fc23f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -39,6 +39,7 @@ struct devlink { possible_net_t _net; struct mutex lock; bool reload_failed; + bool registered; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -772,6 +773,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; struct net *devlink_net(const struct devlink *devlink); +void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); -- cgit v1.2.3 From 1927f41a22a05e3bc178fa47f7ce7be271fbc541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:34 +0200 Subject: net: genetlink: introduce dump info struct to be available during dumpit op Currently the cb->data is taken by ops during non-parallel dumping. Introduce a new structure genl_dumpit_info and store the ops there. Distribute the info to both non-parallel and parallel dumping. Also add a helper genl_dumpit_info() to easily get the info structure in the dumpit callback from cb. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/genetlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9292f1c588b7..fb838f4b0089 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -127,6 +127,20 @@ enum genl_validate_flags { GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; +/** + * struct genl_info - info that is available during dumpit op call + * @ops: generic netlink ops - for internal genl code usage + */ +struct genl_dumpit_info { + const struct genl_ops *ops; +}; + +static inline const struct genl_dumpit_info * +genl_dumpit_info(struct netlink_callback *cb) +{ + return cb->data; +} + /** * struct genl_ops - generic netlink operations * @cmd: command identifier -- cgit v1.2.3 From bf813b0afeae2f012f0e527a526c1b78ca21ad82 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:36 +0200 Subject: net: genetlink: parse attrs and store in contect info struct during dumpit Extend the dumpit info struct for attrs. Instead of existing attribute validation do parse them and save in the info struct. Caller can benefit from this and does not have to do parse itself. In order to properly free attrs, genl_family pointer needs to be added to dumpit info struct as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index fb838f4b0089..922dcc9348b1 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -129,10 +129,14 @@ enum genl_validate_flags { /** * struct genl_info - info that is available during dumpit op call + * @family: generic netlink family - for internal genl code usage * @ops: generic netlink ops - for internal genl code usage + * @attrs: netlink attributes */ struct genl_dumpit_info { + const struct genl_family *family; const struct genl_ops *ops; + struct nlattr **attrs; }; static inline const struct genl_dumpit_info * -- cgit v1.2.3 From 265ecd4fa3f0ca43909f8b2cc0e519966f21b167 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:41 +0200 Subject: net: genetlink: remove unused genl_family_attrbuf() genl_family_attrbuf() function is no longer used by anyone, so remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 922dcc9348b1..74950663bb00 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -75,8 +75,6 @@ struct genl_family { struct module *module; }; -struct nlattr **genl_family_attrbuf(const struct genl_family *family); - /** * struct genl_info - receiving information * @snd_seq: sending sequence number -- cgit v1.2.3 From 5f0e5412781b01708f622d00c0b3f77b9dca7367 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 6 Oct 2019 20:07:36 -0700 Subject: uapi/bpf: fix helper docs Various small fixes to BPF helper documentation comments, enabling automatic header generation with a list of BPF helpers. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77c6be96d676..a65c3b0c6935 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -794,7 +794,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * int bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -1023,7 +1023,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1068,7 +1068,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1085,7 +1085,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1154,7 +1154,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1172,7 +1172,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,7 +1595,7 @@ union bpf_attr { * Return * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. * - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1715,7 +1715,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1947,7 +1947,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -1980,7 +1980,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2033,7 +2033,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2392,7 +2392,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2408,9 +2408,9 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description - * Will remove *pop* bytes from a *msg* starting at byte *start*. + * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if * an allocation and copy are required due to a full ring buffer. * However, the helper will try to avoid doing the allocation @@ -2505,7 +2505,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** -- cgit v1.2.3 From b9df4fd7e99cb8bfd80c4143f3045d63b1754ad0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 6 Oct 2019 18:19:54 +0200 Subject: net: core: change return type of pskb_may_pull to bool This function de-facto returns a bool, so let's change the return type accordingly. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4351577b14d7..0a58402a166e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2261,12 +2261,12 @@ static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len); } -static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) +static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) - return 1; + return true; if (unlikely(len > skb->len)) - return 0; + return false; return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } -- cgit v1.2.3 From 328908621081c3c7455c39549c5334e74b7c525a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 7 Oct 2019 09:37:27 -0400 Subject: ipv6: Make ipv6_mc_may_pull() return bool. Consistent with how pskb_may_pull() also now does so. Signed-off-by: David S. Miller --- include/net/addrconf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3f62b347b04a..1bab88184d3c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -202,11 +202,11 @@ u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, /* * multicast prototypes (mcast.c) */ -static inline int ipv6_mc_may_pull(struct sk_buff *skb, - unsigned int len) +static inline bool ipv6_mc_may_pull(struct sk_buff *skb, + unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) - return 0; + return false; return pskb_may_pull(skb, len); } -- cgit v1.2.3 From 163ab96b52ae2bb2d8f188cd29f0b570610f9007 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:27 -0700 Subject: net: sockmap: use bitmap for copy info Don't use bool array in struct sk_msg_sg, save 12 bytes. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- include/linux/skmsg.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e4b3fb4bb77c..fe80d537945d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -28,13 +28,14 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - bool copy[MAX_MSG_FRAGS]; + unsigned long copy; /* The extra element is used for chaining the front and sections when * the list becomes partitioned (e.g. end < start). The crypto APIs * require the chaining. */ struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -227,7 +228,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (msg->sg.copy[msg->sg.start]) { + if (test_bit(msg->sg.start, &msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -246,7 +247,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - msg->sg.copy[msg->sg.end] = true; + __set_bit(msg->sg.end, &msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -254,7 +255,10 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { - msg->sg.copy[i] = copy_state; + if (copy_state) + __set_bit(i, &msg->sg.copy); + else + __clear_bit(i, &msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; -- cgit v1.2.3 From 4de30a8d58c90e18140342cdcb74903d2e4fbb62 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:30 -0700 Subject: net/tls: pass context to tls_device_decrypted() Avoid unnecessary pointer chasing and calculations, callers already have most of the state tls_device_decrypted() needs. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- include/net/tls.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 24c37bffc961..b809f2362049 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -641,7 +641,8 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); -int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); +int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm); #else static inline void tls_device_init(void) {} static inline void tls_device_cleanup(void) {} @@ -664,7 +665,9 @@ static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} static inline void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} -static inline int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +static inline int +tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx, + struct sk_buff *skb, struct strp_msg *rxm) { return 0; } -- cgit v1.2.3 From 5c5458ec9d631fbca29f53a944168265e18aa77a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:31 -0700 Subject: net/tls: store async_capable on a single bit Store async_capable on a single bit instead of a full integer to save space. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index b809f2362049..97eae7271a67 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -136,7 +136,7 @@ struct tls_sw_context_tx { struct list_head tx_list; atomic_t encrypt_pending; int async_notify; - int async_capable; + u8 async_capable:1; #define BIT_TX_SCHEDULED 0 #define BIT_TX_CLOSING 1 @@ -152,7 +152,7 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; - int async_capable; + u8 async_capable:1; bool decrypted; atomic_t decrypt_pending; bool async_notify; -- cgit v1.2.3 From bc76e5bb1229ede1f26317b813099b0e983e4009 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:32 -0700 Subject: net/tls: store decrypted on a single bit Use a single bit instead of boolean to remember if packet was already decrypted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 97eae7271a67..41265e542e71 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -153,7 +153,7 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; u8 async_capable:1; - bool decrypted; + u8 decrypted:1; atomic_t decrypt_pending; bool async_notify; }; -- cgit v1.2.3 From 1d200e9d6f635ae894993a7d0f1b9e0b6e522e3b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:41 -0700 Subject: block: Fix writeback throttling W=1 compiler warnings Fix the following compiler warnings: In file included from ./include/linux/bitmap.h:9, from ./include/linux/cpumask.h:12, from ./arch/x86/include/asm/cpumask.h:5, from ./arch/x86/include/asm/msr.h:11, from ./arch/x86/include/asm/processor.h:21, from ./arch/x86/include/asm/cpufeature.h:5, from ./arch/x86/include/asm/thread_info.h:53, from ./include/linux/thread_info.h:38, from ./arch/x86/include/asm/preempt.h:7, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from ./include/linux/mmzone.h:8, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from ./include/linux/bvec.h:13, from ./include/linux/blk_types.h:10, from block/blk-wbt.c:23: In function 'strncpy', inlined from 'perf_trace_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Fixes: e34cbd307477 ("blk-wbt: add general throttling mechanism"; v4.10). Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/trace/events/wbt.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index b048694070e2..37342a13c9cb 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,8 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; __entry->rmax = stat[0].max; @@ -67,7 +68,8 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -103,7 +105,8 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; __entry->window = div_u64(window, 1000); @@ -138,7 +141,8 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; __entry->inflight = inflight; -- cgit v1.2.3 From 9566256518de0520c964bdf23140eac324b136af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:42 -0700 Subject: block: Remove request_queue.nr_queues Commit 897bb0c7f1ea ("blk-mq: Use proper cpumask iterator"; v4.6) removed the last use of request_queue.nr_queues from outside blk_mq_init_allocate_queue(). Remove this member variable to make struct request_queue smaller. This patch does not change any functionality. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3ea78b0c91c..d4051acb92a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -411,7 +411,6 @@ struct request_queue { /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; - unsigned int nr_queues; unsigned int queue_depth; -- cgit v1.2.3 From 7a18312c739aeace7c8ea448d39a0313d5ad5d5d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:45 -0700 Subject: block: Document all members of blk_mq_tag_set and bkl_mq_queue_map The meaning of several member variables of these two data structures is nontrivial. Hence document all member variables using the kernel-doc syntax. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 54 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0bf056de5cc3..a96b5cc957ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -76,6 +76,16 @@ struct blk_mq_hw_ctx { struct srcu_struct srcu[0]; }; +/** + * struct blk_mq_queue_map - ctx -> hctx mapping + * @mq_map: CPU ID to hardware queue index map. This is an array + * with nr_cpu_ids elements. Each element has a value in the range + * [@queue_offset, @queue_offset + @nr_queues). + * @nr_queues: Number of hardware queues to map CPU IDs onto. + * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe + * driver to map each hardware queue type (enum hctx_type) onto a distinct + * set of hardware queues. + */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; @@ -90,23 +100,45 @@ enum hctx_type { HCTX_MAX_TYPES, }; +/** + * struct blk_mq_tag_set - tag set that can be shared between request queues + * @map: One or more ctx -> hctx mappings. One map exists for each + * hardware queue type (enum hctx_type) that the driver wishes + * to support. There are no restrictions on maps being of the + * same size, and it's perfectly legal to share maps between + * types. + * @nr_maps: Number of elements in the @map array. A number in the range + * [1, HCTX_MAX_TYPES]. + * @ops: Pointers to functions that implement block driver behavior. + * @nr_hw_queues: Number of hardware queues supported by the block driver that + * owns this data structure. + * @queue_depth: Number of tags per hardware queue, reserved tags included. + * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag + * allocations. + * @cmd_size: Number of additional bytes to allocate per request. The block + * driver owns these additional bytes. + * @numa_node: NUMA node the storage adapter has been connected to. + * @timeout: Request processing timeout in jiffies. + * @flags: Zero or more BLK_MQ_F_* flags. + * @driver_data: Pointer to data owned by the block driver that created this + * tag set. + * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues + * elements. + * @tag_list_lock: Serializes tag_list accesses. + * @tag_list: List of the request queues that use this tag set. See also + * request_queue.tag_set_list. + */ struct blk_mq_tag_set { - /* - * map[] holds ctx -> hctx mappings, one map exists for each type - * that the driver wishes to support. There are no restrictions - * on maps being of the same size, and it's perfectly legal to - * share maps between types. - */ struct blk_mq_queue_map map[HCTX_MAX_TYPES]; - unsigned int nr_maps; /* nr entries in map[] */ + unsigned int nr_maps; const struct blk_mq_ops *ops; - unsigned int nr_hw_queues; /* nr hw queues across maps */ - unsigned int queue_depth; /* max hw supported */ + unsigned int nr_hw_queues; + unsigned int queue_depth; unsigned int reserved_tags; - unsigned int cmd_size; /* per-request extra data */ + unsigned int cmd_size; int numa_node; unsigned int timeout; - unsigned int flags; /* BLK_MQ_F_* */ + unsigned int flags; void *driver_data; struct blk_mq_tags **tags; -- cgit v1.2.3 From 27a46989a82c71028f2ba15a3f2c8f30451fda33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 11:25:49 +0300 Subject: blk-mq: Inline status checkers blk_mq_request_completed() and blk_mq_request_started() are short, inline it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a96b5cc957ab..e0fce93ac127 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -333,9 +333,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->state); +} + +static inline int blk_mq_request_started(struct request *rq) +{ + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; +} + +static inline int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} -int blk_mq_request_started(struct request *rq); -int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); -- cgit v1.2.3 From 8c2a2b8c2ff680b1d0f715cf1f320722b762f9dd Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 3 Oct 2019 11:52:29 +0200 Subject: nvmem: core: add nvmem_device_find nvmem_device_find provides a way to search for nvmem devices with the help of a match function simlair to bus_find_device. Reviewed-by: Srinivas Kandagatla Acked-by: Srinivas Kandagatla Signed-off-by: Thomas Bogendoerfer Signed-off-by: Paul Burton Cc: Jonathan Corbet Cc: Ralf Baechle Cc: James Hogan Cc: Lee Jones Cc: David S. Miller Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-rtc@vger.kernel.org Cc: linux-serial@vger.kernel.org --- include/linux/nvmem-consumer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 8f8be5b00060..02dc4aa992b2 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -89,6 +89,9 @@ void nvmem_del_cell_lookups(struct nvmem_cell_lookup *entries, int nvmem_register_notifier(struct notifier_block *nb); int nvmem_unregister_notifier(struct notifier_block *nb); +struct nvmem_device *nvmem_device_find(void *data, + int (*match)(struct device *dev, const void *data)); + #else static inline struct nvmem_cell *nvmem_cell_get(struct device *dev, @@ -204,6 +207,12 @@ static inline int nvmem_unregister_notifier(struct notifier_block *nb) return -EOPNOTSUPP; } +static inline struct nvmem_device *nvmem_device_find(void *data, + int (*match)(struct device *dev, const void *data)) +{ + return NULL; +} + #endif /* CONFIG_NVMEM */ #if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF) -- cgit v1.2.3 From b42faeee718ce13ef6eb99c24880b58deb54c8fa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 Sep 2019 04:01:12 +0300 Subject: spi: Add a PTP system timestamp to the transfer structure SPI is one of the interfaces used to access devices which have a POSIX clock driver (real time clocks, 1588 timers etc). The fact that the SPI bus is slow is not what the main problem is, but rather the fact that drivers don't take a constant amount of time in transferring data over SPI. When there is a high delay in the readout of time, there will be uncertainty in the value that has been read out of the peripheral. When that delay is constant, the uncertainty can at least be approximated with a certain accuracy which is fine more often than not. Timing jitter occurs all over in the kernel code, and is mainly caused by having to let go of the CPU for various reasons such as preemption, servicing interrupts, going to sleep, etc. Another major reason is CPU dynamic frequency scaling. It turns out that the problem of retrieving time from a SPI peripheral with high accuracy can be solved by the use of "PTP system timestamping" - a mechanism to correlate the time when the device has snapshotted its internal time counter with the Linux system time at that same moment. This is sufficient for having a precise time measurement - it is not necessary for the whole SPI transfer to be transmitted "as fast as possible", or "as low-jitter as possible". The system has to be low-jitter for a very short amount of time to be effective. This patch introduces a PTP system timestamping mechanism in struct spi_transfer. This is to be used by SPI device drivers when they need to know the exact time at which the underlying device's time was snapshotted. More often than not, SPI peripherals have a very exact timing for when their SPI-to-interconnect bridge issues a transaction for snapshotting and reading the time register, and that will be dependent on when the SPI-to-interconnect bridge figures out that this is what it should do, aka as soon as it sees byte N of the SPI transfer. Since spi_device drivers are the ones who'd know best how the peripheral behaves in this regard, expose a mechanism in spi_transfer which allows them to specify which word (or word range) from the transfer should be timestamped. Add a default implementation of the PTP system timestamping in the SPI core. This is not going to be satisfactory performance-wise, but should at least increase the likelihood that SPI device drivers will use PTP system timestamping in the future. There are 3 entry points from the core towards the SPI controller drivers: - transfer_one: The driver is passed individual spi_transfers to execute. This is the easiest to timestamp. - transfer_one_message: The core passes the driver an entire spi_message (a potential batch of spi_transfers). The core puts the same pre and post timestamp to all transfers within a message. This is not ideal, but nothing better can be done by default anyway, since the core has no insight into how the driver batches the transfers. - transfer: Like transfer_one_message, but for unqueued drivers (i.e. the driver implements its own queue scheduling). Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20190905010114.26718-3-olteanv@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af4f265d0f67..27f6b046cf92 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -13,6 +13,7 @@ #include #include #include +#include struct dma_chan; struct property_entry; @@ -409,6 +410,12 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @fw_translate_cs: If the boot firmware uses different numbering scheme * what Linux expects, this optional hook can be used to translate * between the two. + * @ptp_sts_supported: If the driver sets this to true, it must provide a + * time snapshot in @spi_transfer->ptp_sts as close as possible to the + * moment in time when @spi_transfer->ptp_sts_word_pre and + * @spi_transfer->ptp_sts_word_post were transmitted. + * If the driver does not set this, the SPI core takes the snapshot as + * close to the driver hand-over as possible. * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -604,6 +611,15 @@ struct spi_controller { void *dummy_tx; int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs); + + /* + * Driver sets this field to indicate it is able to snapshot SPI + * transfers (needed e.g. for reading the time of POSIX clocks) + */ + bool ptp_sts_supported; + + /* Interrupt enable state during PTP system timestamping */ + unsigned long irq_flags; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) @@ -644,6 +660,14 @@ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ct extern void spi_finalize_current_message(struct spi_controller *ctlr); extern void spi_finalize_current_transfer(struct spi_controller *ctlr); +/* Helper calls for driver to timestamp transfer */ +void spi_take_timestamp_pre(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); +void spi_take_timestamp_post(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); + /* the spi driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool slave); @@ -753,6 +777,35 @@ extern void spi_res_release(struct spi_controller *ctlr, * @transfer_list: transfers are sequenced through @spi_message.transfers * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use + * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset + * within @tx_buf for which the SPI device is requesting that the time + * snapshot for this transfer begins. Upon completing the SPI transfer, + * this value may have changed compared to what was requested, depending + * on the available snapshotting resolution (DMA transfer, + * @ptp_sts_supported is false, etc). + * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning + * that a single byte should be snapshotted). + * If the core takes care of the timestamp (if @ptp_sts_supported is false + * for this controller), it will set @ptp_sts_word_pre to 0, and + * @ptp_sts_word_post to the length of the transfer. This is done + * purposefully (instead of setting to spi_transfer->len - 1) to denote + * that a transfer-level snapshot taken from within the driver may still + * be of higher quality. + * @ptp_sts: Pointer to a memory location held by the SPI slave device where a + * PTP system timestamp structure may lie. If drivers use PIO or their + * hardware has some sort of assist for retrieving exact transfer timing, + * they can (and should) assert @ptp_sts_supported and populate this + * structure using the ptp_read_system_*ts helper functions. + * The timestamp must represent the time at which the SPI slave device has + * processed the word, i.e. the "pre" timestamp should be taken before + * transmitting the "pre" word, and the "post" timestamp after receiving + * transmit confirmation from the controller for the "post" word. + * @timestamped_pre: Set by the SPI controller driver to denote it has acted + * upon the @ptp_sts request. Not set when the SPI core has taken care of + * the task. SPI device drivers are free to print a warning if this comes + * back unset and they need the better resolution. + * @timestamped_post: See above. The reason why both exist is that these + * booleans are also used to keep state in the core SPI logic. * * SPI transfers always write the same number of bytes as they read. * Protocol drivers should always provide @rx_buf and/or @tx_buf. @@ -842,6 +895,14 @@ struct spi_transfer { u32 effective_speed_hz; + unsigned int ptp_sts_word_pre; + unsigned int ptp_sts_word_post; + + struct ptp_system_timestamp *ptp_sts; + + bool timestamped_pre; + bool timestamped_post; + struct list_head transfer_list; }; -- cgit v1.2.3 From 017f77c050a3bc1f1ff877d1f265beeee26d7dea Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:01 +0100 Subject: netfilter: ipset: add a coding-style fix to ip_set_ext_destroy. Use a local variable to hold comment in order to align the arguments of ip_set_comment_free properly. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 9bc255a8461b..9fee4837d02c 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -269,9 +269,11 @@ ip_set_ext_destroy(struct ip_set *set, void *data) /* Check that the extension is enabled for the set and * call it's destroy function for its extension part in data. */ - if (SET_WITH_COMMENT(set)) - ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy( - set, ext_comment(data, set)); + if (SET_WITH_COMMENT(set)) { + struct ip_set_comment *c = ext_comment(data, set); + + ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(set, c); + } } static inline int -- cgit v1.2.3 From 94177f6e11c74b6ca3bcf7f65d3d74f00bbd6a8c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:03 +0100 Subject: netfilter: ipset: move ip_set_comment functions from ip_set.h to ip_set_core.c. Most of the functions are only called from within ip_set_core.c. The exception is ip_set_init_comment. However, this is too complex to be a good candidate for a static inline function. Move it to ip_set_core.c, change its linkage to extern and export it, leaving a declaration in ip_set.h. ip_set_comment_free is only used as an extension destructor, so change its prototype to match and drop cast. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 63 ++-------------------------------- 1 file changed, 2 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 9fee4837d02c..985c9bb1ab65 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -521,67 +521,8 @@ ip_set_timeout_get(const unsigned long *timeout) return t == 0 ? 1 : t; } -static inline char* -ip_set_comment_uget(struct nlattr *tb) -{ - return nla_data(tb); -} - -/* Called from uadd only, protected by the set spinlock. - * The kadt functions don't use the comment extensions in any way. - */ -static inline void -ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, - const struct ip_set_ext *ext) -{ - struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); - size_t len = ext->comment ? strlen(ext->comment) : 0; - - if (unlikely(c)) { - set->ext_size -= sizeof(*c) + strlen(c->str) + 1; - kfree_rcu(c, rcu); - rcu_assign_pointer(comment->c, NULL); - } - if (!len) - return; - if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) - len = IPSET_MAX_COMMENT_SIZE; - c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); - if (unlikely(!c)) - return; - strlcpy(c->str, ext->comment, len + 1); - set->ext_size += sizeof(*c) + strlen(c->str) + 1; - rcu_assign_pointer(comment->c, c); -} - -/* Used only when dumping a set, protected by rcu_read_lock() */ -static inline int -ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment) -{ - struct ip_set_comment_rcu *c = rcu_dereference(comment->c); - - if (!c) - return 0; - return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); -} - -/* Called from uadd/udel, flush or the garbage collectors protected - * by the set spinlock. - * Called when the set is destroyed and when there can't be any user - * of the set data anymore. - */ -static inline void -ip_set_comment_free(struct ip_set *set, struct ip_set_comment *comment) -{ - struct ip_set_comment_rcu *c; - - c = rcu_dereference_protected(comment->c, 1); - if (unlikely(!c)) - return; - set->ext_size -= sizeof(*c) + strlen(c->str) + 1; - kfree_rcu(c, rcu); - rcu_assign_pointer(comment->c, NULL); -} +void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, + const struct ip_set_ext *ext); static inline void ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) -- cgit v1.2.3 From 2398a97688f1aaca09d0a5a809f361e2abf5ff3c Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:04 +0100 Subject: netfilter: ipset: move functions to ip_set_core.c. Several inline functions in ip_set.h are only called in ip_set_core.c: move them and remove inline function specifier. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 102 --------------------------------- 1 file changed, 102 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 985c9bb1ab65..44f6de8a1733 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -508,86 +508,9 @@ ip_set_timeout_set(unsigned long *timeout, u32 value) *timeout = t; } -static inline u32 -ip_set_timeout_get(const unsigned long *timeout) -{ - u32 t; - - if (*timeout == IPSET_ELEM_PERMANENT) - return 0; - - t = jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC; - /* Zero value in userspace means no timeout */ - return t == 0 ? 1 : t; -} - void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, const struct ip_set_ext *ext); -static inline void -ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter) -{ - atomic64_add((long long)bytes, &(counter)->bytes); -} - -static inline void -ip_set_add_packets(u64 packets, struct ip_set_counter *counter) -{ - atomic64_add((long long)packets, &(counter)->packets); -} - -static inline u64 -ip_set_get_bytes(const struct ip_set_counter *counter) -{ - return (u64)atomic64_read(&(counter)->bytes); -} - -static inline u64 -ip_set_get_packets(const struct ip_set_counter *counter) -{ - return (u64)atomic64_read(&(counter)->packets); -} - -static inline bool -ip_set_match_counter(u64 counter, u64 match, u8 op) -{ - switch (op) { - case IPSET_COUNTER_NONE: - return true; - case IPSET_COUNTER_EQ: - return counter == match; - case IPSET_COUNTER_NE: - return counter != match; - case IPSET_COUNTER_LT: - return counter < match; - case IPSET_COUNTER_GT: - return counter > match; - } - return false; -} - -static inline void -ip_set_update_counter(struct ip_set_counter *counter, - const struct ip_set_ext *ext, u32 flags) -{ - if (ext->packets != ULLONG_MAX && - !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) { - ip_set_add_bytes(ext->bytes, counter); - ip_set_add_packets(ext->packets, counter); - } -} - -static inline bool -ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter) -{ - return nla_put_net64(skb, IPSET_ATTR_BYTES, - cpu_to_be64(ip_set_get_bytes(counter)), - IPSET_ATTR_PAD) || - nla_put_net64(skb, IPSET_ATTR_PACKETS, - cpu_to_be64(ip_set_get_packets(counter)), - IPSET_ATTR_PAD); -} - static inline void ip_set_init_counter(struct ip_set_counter *counter, const struct ip_set_ext *ext) @@ -598,31 +521,6 @@ ip_set_init_counter(struct ip_set_counter *counter, atomic64_set(&(counter)->packets, (long long)(ext->packets)); } -static inline void -ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, - const struct ip_set_ext *ext, - struct ip_set_ext *mext, u32 flags) -{ - mext->skbinfo = *skbinfo; -} - -static inline bool -ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo) -{ - /* Send nonzero parameters only */ - return ((skbinfo->skbmark || skbinfo->skbmarkmask) && - nla_put_net64(skb, IPSET_ATTR_SKBMARK, - cpu_to_be64((u64)skbinfo->skbmark << 32 | - skbinfo->skbmarkmask), - IPSET_ATTR_PAD)) || - (skbinfo->skbprio && - nla_put_net32(skb, IPSET_ATTR_SKBPRIO, - cpu_to_be32(skbinfo->skbprio))) || - (skbinfo->skbqueue && - nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, - cpu_to_be16(skbinfo->skbqueue))); -} - static inline void ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, const struct ip_set_ext *ext) -- cgit v1.2.3 From 856391854ce73015fbe2b235f5886205aab166b0 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:05 +0100 Subject: netfilter: ipset: make ip_set_put_flags extern. ip_set_put_flags is rather large for a static inline function in a header-file. Move it to ip_set_core.c and export it. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 44f6de8a1733..4d8b1eaf7708 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -276,28 +276,7 @@ ip_set_ext_destroy(struct ip_set *set, void *data) } } -static inline int -ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) -{ - u32 cadt_flags = 0; - - if (SET_WITH_TIMEOUT(set)) - if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT, - htonl(set->timeout)))) - return -EMSGSIZE; - if (SET_WITH_COUNTER(set)) - cadt_flags |= IPSET_FLAG_WITH_COUNTERS; - if (SET_WITH_COMMENT(set)) - cadt_flags |= IPSET_FLAG_WITH_COMMENT; - if (SET_WITH_SKBINFO(set)) - cadt_flags |= IPSET_FLAG_WITH_SKBINFO; - if (SET_WITH_FORCEADD(set)) - cadt_flags |= IPSET_FLAG_WITH_FORCEADD; - - if (!cadt_flags) - return 0; - return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags)); -} +int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set); /* Netlink CB args */ enum { -- cgit v1.2.3 From 3fbd6c4513b5c27465a1dcf2e4286e6c3183bb1f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:06 +0100 Subject: netfilter: ipset: move function to ip_set_bitmap_ip.c. One inline function in ip_set_bitmap.h is only called in ip_set_bitmap_ip.c: move it and remove inline function specifier. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_bitmap.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set_bitmap.h b/include/linux/netfilter/ipset/ip_set_bitmap.h index 2dddbc6dcac7..fcc4d214a788 100644 --- a/include/linux/netfilter/ipset/ip_set_bitmap.h +++ b/include/linux/netfilter/ipset/ip_set_bitmap.h @@ -12,18 +12,4 @@ enum { IPSET_ADD_START_STORED_TIMEOUT, }; -/* Common functions */ - -static inline u32 -range_to_mask(u32 from, u32 to, u8 *bits) -{ - u32 mask = 0xFFFFFFFE; - - *bits = 32; - while (--(*bits) > 0 && mask && (to & mask) != from) - mask <<= 1; - - return mask; -} - #endif /* __IP_SET_BITMAP_H */ -- cgit v1.2.3 From f8615bf8a3dabd84bf844c6f888929495039d389 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 3 Oct 2019 20:56:07 +0100 Subject: netfilter: ipset: move ip_set_get_ip_port() to ip_set_bitmap_port.c. ip_set_get_ip_port() is only used in ip_set_bitmap_port.c. Move it there and make it static. Signed-off-by: Jeremy Sowden Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set_getport.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h index d74cd112b88a..1ecaabd9a048 100644 --- a/include/linux/netfilter/ipset/ip_set_getport.h +++ b/include/linux/netfilter/ipset/ip_set_getport.h @@ -20,9 +20,6 @@ static inline bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src, } #endif -extern bool ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, - __be16 *port); - static inline bool ip_set_proto_with_ports(u8 proto) { switch (proto) { -- cgit v1.2.3 From 7d47433cf74f942a414171867d89c08640cfef45 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Mon, 7 Oct 2019 16:59:31 +0300 Subject: net/mlx5: Expose optimal performance scatter entries capability Expose maximum scatter entries per RDMA READ for optimal performance. Signed-off-by: Yamin Friedman Reviewed-by: Or Gerlitz Reviewed-by: Christoph Hellwig Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 138c50d5a353..c0bfb1d90dd2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1153,7 +1153,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_srq[0x5]; u8 reserved_at_b0[0x10]; - u8 reserved_at_c0[0x8]; + u8 max_sgl_for_optimized_performance[0x8]; u8 log_max_cq_sz[0x8]; u8 reserved_at_d0[0xb]; u8 log_max_cq[0x5]; -- cgit v1.2.3 From 5d5a0815f854a5b0e21d97e16cfadad69ce5fb04 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Fri, 27 Sep 2019 12:54:50 +0800 Subject: ipvs: batch __ip_vs_cleanup It's better to batch __ip_vs_cleanup to speedup ipvs connections dismantle. Signed-off-by: Haishuang Yan Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3759167f91f5..93e7a252993d 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1324,7 +1324,7 @@ void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs); void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs); -void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs); +void ip_vs_service_nets_cleanup(struct list_head *net_list); /* IPVS application functions * (from ip_vs_app.c) -- cgit v1.2.3 From 79591b7db21d255db158afaa48c557dcab631a1c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 Sep 2019 04:01:12 +0300 Subject: spi: Add a PTP system timestamp to the transfer structure SPI is one of the interfaces used to access devices which have a POSIX clock driver (real time clocks, 1588 timers etc). The fact that the SPI bus is slow is not what the main problem is, but rather the fact that drivers don't take a constant amount of time in transferring data over SPI. When there is a high delay in the readout of time, there will be uncertainty in the value that has been read out of the peripheral. When that delay is constant, the uncertainty can at least be approximated with a certain accuracy which is fine more often than not. Timing jitter occurs all over in the kernel code, and is mainly caused by having to let go of the CPU for various reasons such as preemption, servicing interrupts, going to sleep, etc. Another major reason is CPU dynamic frequency scaling. It turns out that the problem of retrieving time from a SPI peripheral with high accuracy can be solved by the use of "PTP system timestamping" - a mechanism to correlate the time when the device has snapshotted its internal time counter with the Linux system time at that same moment. This is sufficient for having a precise time measurement - it is not necessary for the whole SPI transfer to be transmitted "as fast as possible", or "as low-jitter as possible". The system has to be low-jitter for a very short amount of time to be effective. This patch introduces a PTP system timestamping mechanism in struct spi_transfer. This is to be used by SPI device drivers when they need to know the exact time at which the underlying device's time was snapshotted. More often than not, SPI peripherals have a very exact timing for when their SPI-to-interconnect bridge issues a transaction for snapshotting and reading the time register, and that will be dependent on when the SPI-to-interconnect bridge figures out that this is what it should do, aka as soon as it sees byte N of the SPI transfer. Since spi_device drivers are the ones who'd know best how the peripheral behaves in this regard, expose a mechanism in spi_transfer which allows them to specify which word (or word range) from the transfer should be timestamped. Add a default implementation of the PTP system timestamping in the SPI core. This is not going to be satisfactory performance-wise, but should at least increase the likelihood that SPI device drivers will use PTP system timestamping in the future. There are 3 entry points from the core towards the SPI controller drivers: - transfer_one: The driver is passed individual spi_transfers to execute. This is the easiest to timestamp. - transfer_one_message: The core passes the driver an entire spi_message (a potential batch of spi_transfers). The core puts the same pre and post timestamp to all transfers within a message. This is not ideal, but nothing better can be done by default anyway, since the core has no insight into how the driver batches the transfers. - transfer: Like transfer_one_message, but for unqueued drivers (i.e. the driver implements its own queue scheduling). Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20190905010114.26718-3-olteanv@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af4f265d0f67..27f6b046cf92 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -13,6 +13,7 @@ #include #include #include +#include struct dma_chan; struct property_entry; @@ -409,6 +410,12 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @fw_translate_cs: If the boot firmware uses different numbering scheme * what Linux expects, this optional hook can be used to translate * between the two. + * @ptp_sts_supported: If the driver sets this to true, it must provide a + * time snapshot in @spi_transfer->ptp_sts as close as possible to the + * moment in time when @spi_transfer->ptp_sts_word_pre and + * @spi_transfer->ptp_sts_word_post were transmitted. + * If the driver does not set this, the SPI core takes the snapshot as + * close to the driver hand-over as possible. * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals @@ -604,6 +611,15 @@ struct spi_controller { void *dummy_tx; int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs); + + /* + * Driver sets this field to indicate it is able to snapshot SPI + * transfers (needed e.g. for reading the time of POSIX clocks) + */ + bool ptp_sts_supported; + + /* Interrupt enable state during PTP system timestamping */ + unsigned long irq_flags; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) @@ -644,6 +660,14 @@ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ct extern void spi_finalize_current_message(struct spi_controller *ctlr); extern void spi_finalize_current_transfer(struct spi_controller *ctlr); +/* Helper calls for driver to timestamp transfer */ +void spi_take_timestamp_pre(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); +void spi_take_timestamp_post(struct spi_controller *ctlr, + struct spi_transfer *xfer, + const void *tx, bool irqs_off); + /* the spi driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool slave); @@ -753,6 +777,35 @@ extern void spi_res_release(struct spi_controller *ctlr, * @transfer_list: transfers are sequenced through @spi_message.transfers * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use + * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset + * within @tx_buf for which the SPI device is requesting that the time + * snapshot for this transfer begins. Upon completing the SPI transfer, + * this value may have changed compared to what was requested, depending + * on the available snapshotting resolution (DMA transfer, + * @ptp_sts_supported is false, etc). + * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning + * that a single byte should be snapshotted). + * If the core takes care of the timestamp (if @ptp_sts_supported is false + * for this controller), it will set @ptp_sts_word_pre to 0, and + * @ptp_sts_word_post to the length of the transfer. This is done + * purposefully (instead of setting to spi_transfer->len - 1) to denote + * that a transfer-level snapshot taken from within the driver may still + * be of higher quality. + * @ptp_sts: Pointer to a memory location held by the SPI slave device where a + * PTP system timestamp structure may lie. If drivers use PIO or their + * hardware has some sort of assist for retrieving exact transfer timing, + * they can (and should) assert @ptp_sts_supported and populate this + * structure using the ptp_read_system_*ts helper functions. + * The timestamp must represent the time at which the SPI slave device has + * processed the word, i.e. the "pre" timestamp should be taken before + * transmitting the "pre" word, and the "post" timestamp after receiving + * transmit confirmation from the controller for the "post" word. + * @timestamped_pre: Set by the SPI controller driver to denote it has acted + * upon the @ptp_sts request. Not set when the SPI core has taken care of + * the task. SPI device drivers are free to print a warning if this comes + * back unset and they need the better resolution. + * @timestamped_post: See above. The reason why both exist is that these + * booleans are also used to keep state in the core SPI logic. * * SPI transfers always write the same number of bytes as they read. * Protocol drivers should always provide @rx_buf and/or @tx_buf. @@ -842,6 +895,14 @@ struct spi_transfer { u32 effective_speed_hz; + unsigned int ptp_sts_word_pre; + unsigned int ptp_sts_word_post; + + struct ptp_system_timestamp *ptp_sts; + + bool timestamped_pre; + bool timestamped_post; + struct list_head transfer_list; }; -- cgit v1.2.3 From bacb7e1855969bba78b32302453d2cc8ba0bc403 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 14:20:34 -0700 Subject: Revert "tun: call dev_get_valid_name() before register_netdevice()" This reverts commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d. As noticed by Jakub, this is no longer needed after commit 11fc7d5a0a2d ("tun: fix memory leak in error path") This no longer exports dev_get_valid_name() for the exclusive use of tun driver. Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fe45b2c72315..3207e0b9ec4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4113,9 +4113,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name); - #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) -- cgit v1.2.3 From fd1ac07f3f17fbbc2f08e3b43951bed937d86a7b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 4 Oct 2019 00:21:57 +0300 Subject: xfrm: ifdef setsockopt(UDP_ENCAP_ESPINUDP/UDP_ENCAP_ESPINUDP_NON_IKE) If IPsec is not configured, there is no reason to delay the inevitable. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index aa08a7a5f6ac..dda3c025452e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1613,13 +1613,6 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optv { return -ENOPROTOOPT; } - -static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) -{ - /* should not happen */ - kfree_skb(skb); - return 0; -} #endif struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, -- cgit v1.2.3 From 4b7740324ed86aa4b02cef134da4b79078294d72 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:33 +0800 Subject: sctp: add SCTP_ADDR_ADDED event A helper sctp_ulpevent_nofity_peer_addr_change() will be extracted to make peer_addr_change event and enqueue it, and the helper will be called in sctp_assoc_add_peer() to send SCTP_ADDR_ADDED event. This event is described in rfc6458#section-6.1.2: SCTP_ADDR_ADDED: The address is now part of the association. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpevent.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index e1a92c4610f3..e6ead1ed74dd 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -80,13 +80,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( struct sctp_chunk *chunk, gfp_t gfp); -struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( - const struct sctp_association *asoc, - const struct sockaddr_storage *aaddr, - int flags, - int state, - int error, - gfp_t gfp); +void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, + int state, int error); struct sctp_ulpevent *sctp_ulpevent_make_remote_error( const struct sctp_association *asoc, -- cgit v1.2.3 From b6e6b5f1da7e8d092f86a4351802c27c0170c5a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:36 +0800 Subject: sctp: add SCTP_SEND_FAILED_EVENT event This patch is to add a new event SCTP_SEND_FAILED_EVENT described in rfc6458#section-6.1.11. It's a update of SCTP_SEND_FAILED event: struct sctp_sndrcvinfo ssf_info is replaced with struct sctp_sndinfo ssfe_info in struct sctp_send_failed_event. SCTP_SEND_FAILED is being deprecated, but we don't remove it in this patch. Both are being processed in sctp_datamsg_destroy() when the corresp event flag is set. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpevent.h | 7 +++++++ include/uapi/linux/sctp.h | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index e6ead1ed74dd..0b032b92da0b 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -95,6 +95,13 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( __u32 error, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + __u16 flags, + __u32 error, + gfp_t gfp); + struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( const struct sctp_association *asoc, __u16 flags, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6d5b164af55c..6bce7f9837a9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -449,6 +449,16 @@ struct sctp_send_failed { __u8 ssf_data[0]; }; +struct sctp_send_failed_event { + __u16 ssf_type; + __u16 ssf_flags; + __u32 ssf_length; + __u32 ssf_error; + struct sctp_sndinfo ssfe_info; + sctp_assoc_t ssf_assoc_id; + __u8 ssf_data[0]; +}; + /* * ssf_flags: 16 bits (unsigned integer) * @@ -605,6 +615,7 @@ struct sctp_event_subscribe { __u8 sctp_stream_reset_event; __u8 sctp_assoc_reset_event; __u8 sctp_stream_change_event; + __u8 sctp_send_failure_event_event; }; /* @@ -632,6 +643,7 @@ union sctp_notification { struct sctp_stream_reset_event sn_strreset_event; struct sctp_assoc_reset_event sn_assocreset_event; struct sctp_stream_change_event sn_strchange_event; + struct sctp_send_failed_event sn_send_failed_event; }; /* Section 5.3.1 @@ -667,7 +679,9 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT - SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, + SCTP_SEND_FAILED_EVENT, +#define SCTP_SEND_FAILED_EVENT SCTP_SEND_FAILED_EVENT + SCTP_SN_TYPE_MAX = SCTP_SEND_FAILED_EVENT, #define SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; -- cgit v1.2.3 From 690a6ca7df3de7b90546bc10a620d1ac8ccaa1a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Oct 2019 21:03:14 -0700 Subject: DIM: fix dim.h kernel-doc and headers Lots of fixes to kernel-doc in structs, enums, and functions. Also add header files that are being used but not yet #included. Signed-off-by: Randy Dunlap Cc: Yamin Friedman Cc: Tal Gilboa Cc: Saeed Mahameed Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-rdma@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Jakub Kicinski --- include/linux/dim.h | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/dim.h b/include/linux/dim.h index 9fa4b3f88c39..b698266d0035 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -4,22 +4,26 @@ #ifndef DIM_H #define DIM_H +#include +#include #include +#include +#include -/** +/* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. */ #define DIM_NEVENTS 64 -/** +/* * Is a difference between values justifies taking an action. * We consider 10% difference as significant. */ #define IS_SIGNIFICANT_DIFF(val, ref) \ (((100UL * abs((val) - (ref))) / (ref)) > 10) -/** +/* * Calculate the gap between two values. * Take wrap-around and variable size into consideration. */ @@ -27,12 +31,13 @@ & (BIT_ULL(bits) - 1)) /** - * Structure for CQ moderation values. + * struct dim_cq_moder - Structure for CQ moderation values. * Used for communications between DIM and its consumer. * * @usec: CQ timer suggestion (by DIM) * @pkts: CQ packet counter suggestion (by DIM) - * @cq_period_mode: CQ priod count mode (from CQE/EQE) + * @comps: Completion counter + * @cq_period_mode: CQ period count mode (from CQE/EQE) */ struct dim_cq_moder { u16 usec; @@ -42,13 +47,14 @@ struct dim_cq_moder { }; /** - * Structure for DIM sample data. + * struct dim_sample - Structure for DIM sample data. * Used for communications between DIM and its consumer. * * @time: Sample timestamp * @pkt_ctr: Number of packets * @byte_ctr: Number of bytes * @event_ctr: Number of events + * @comp_ctr: Current completion counter */ struct dim_sample { ktime_t time; @@ -59,12 +65,14 @@ struct dim_sample { }; /** - * Structure for DIM stats. + * struct dim_stats - Structure for DIM stats. * Used for holding current measured rates. * * @ppms: Packets per msec * @bpms: Bytes per msec * @epms: Events per msec + * @cpms: Completions per msec + * @cpe_ratio: Ratio of completions to events */ struct dim_stats { int ppms; /* packets per msec */ @@ -75,12 +83,13 @@ struct dim_stats { }; /** - * Main structure for dynamic interrupt moderation (DIM). + * struct dim - Main structure for dynamic interrupt moderation (DIM). * Used for holding all information about a specific DIM instance. * * @state: Algorithm state (see below) * @prev_stats: Measured rates from previous iteration (for comparison) * @start_sample: Sampled data at start of current iteration + * @measuring_sample: A &dim_sample that is used to update the current events * @work: Work to perform on action required * @priv: A pointer to the struct that points to dim * @profile_ix: Current moderation profile @@ -106,24 +115,21 @@ struct dim { }; /** - * enum dim_cq_period_mode - * - * These are the modes for CQ period count. + * enum dim_cq_period_mode - Modes for CQ period count * * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset) * @DIM_CQ_PERIOD_NUM_MODES: Number of modes */ -enum { +enum dim_cq_period_mode { DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, DIM_CQ_PERIOD_NUM_MODES }; /** - * enum dim_state + * enum dim_state - DIM algorithm states * - * These are the DIM algorithm states. * These will determine if the algorithm is in a valid state to start an iteration. * * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile) @@ -131,16 +137,15 @@ enum { * need to perform an action * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure */ -enum { +enum dim_state { DIM_START_MEASURE, DIM_MEASURE_IN_PROGRESS, DIM_APPLY_NEW_PROFILE, }; /** - * enum dim_tune_state + * enum dim_tune_state - DIM algorithm tune states * - * These are the DIM algorithm tune states. * These will determine which action the algorithm should perform. * * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference @@ -148,7 +153,7 @@ enum { * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels */ -enum { +enum dim_tune_state { DIM_PARKING_ON_TOP, DIM_PARKING_TIRED, DIM_GOING_RIGHT, @@ -156,25 +161,23 @@ enum { }; /** - * enum dim_stats_state + * enum dim_stats_state - DIM algorithm statistics states * - * These are the DIM algorithm statistics states. * These will determine the verdict of current iteration. * * @DIM_STATS_WORSE: Current iteration shows worse performance than before - * @DIM_STATS_WORSE: Current iteration shows same performance than before - * @DIM_STATS_WORSE: Current iteration shows better performance than before + * @DIM_STATS_SAME: Current iteration shows same performance than before + * @DIM_STATS_BETTER: Current iteration shows better performance than before */ -enum { +enum dim_stats_state { DIM_STATS_WORSE, DIM_STATS_SAME, DIM_STATS_BETTER, }; /** - * enum dim_step_result + * enum dim_step_result - DIM algorithm step results * - * These are the DIM algorithm step results. * These describe the result of a step. * * @DIM_STEPPED: Performed a regular step @@ -182,7 +185,7 @@ enum { * tired parking * @DIM_ON_EDGE: Stepped to the most left/right profile */ -enum { +enum dim_step_result { DIM_STEPPED, DIM_TOO_TIRED, DIM_ON_EDGE, @@ -199,7 +202,7 @@ enum { bool dim_on_top(struct dim *dim); /** - * dim_turn - change profile alterning direction + * dim_turn - change profile altering direction * @dim: DIM context * * Go left if we were going right and vice-versa. @@ -238,7 +241,7 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats); /** - * dim_update_sample - set a sample's fields with give values + * dim_update_sample - set a sample's fields with given values * @event_ctr: number of events to set * @packets: number of packets to set * @bytes: number of bytes to set @@ -304,8 +307,8 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); * @end_sample: Current data measurement * * Called by the consumer. - * This is the main logic of the algorithm, where data is processed in order to decide on next - * required action. + * This is the main logic of the algorithm, where data is processed in order + * to decide on next required action. */ void net_dim(struct dim *dim, struct dim_sample end_sample); -- cgit v1.2.3 From a2351c5d86d7acf8eef17fba4ac1fc5b305a37c0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:43 +0200 Subject: net/smc: separate SMCD and SMCR link group lists Currently SMCD and SMCR link groups are maintained in one list. To facilitate abnormal termination handling they are split into a separate list for SMCR link groups and separate lists for SMCD link groups per SMCD device. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index bd9c0fb3b577..c08e8c415673 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -75,6 +75,7 @@ struct smcd_dev { struct workqueue_struct *event_wq; u8 pnetid[SMC_MAX_PNETID_LEN]; bool pnetid_by_user; + struct list_head lgr_list; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From a0a62ee15a829ebf8aeec55a4f1688230439b3e0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:44 +0200 Subject: net/smc: separate locks for SMCD and SMCR link group lists This patch introduces separate locks for the split SMCD and SMCR link group lists. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index c08e8c415673..438bb0261f45 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -76,6 +76,7 @@ struct smcd_dev { u8 pnetid[SMC_MAX_PNETID_LEN]; bool pnetid_by_user; struct list_head lgr_list; + spinlock_t lgr_lock; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From c3d9494e68c4a5d23227ede822fda9bd68bef8e3 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 9 Oct 2019 10:07:46 +0200 Subject: net/smc: no new connections on disappearing devices Add a "going_away" indication to ISM devices and IB ports and avoid creation of new connections on such disappearing devices. And do not handle ISM events if ISM device is disappearing. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 438bb0261f45..05174ae4f325 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -77,6 +77,7 @@ struct smcd_dev { bool pnetid_by_user; struct list_head lgr_list; spinlock_t lgr_lock; + u8 going_away : 1; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, -- cgit v1.2.3 From 977da0738f3ba3569b883ed6209c300bfcb695d4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 30 Sep 2019 14:14:35 +0200 Subject: crypto: inside-secure - Remove #ifdef checks When both PCI and OF are disabled, no drivers are registered, and we get some unused-function warnings: drivers/crypto/inside-secure/safexcel.c:1221:13: error: unused function 'safexcel_unregister_algorithms' [-Werror,-Wunused-function] static void safexcel_unregister_algorithms(struct safexcel_crypto_priv *priv) drivers/crypto/inside-secure/safexcel.c:1307:12: error: unused function 'safexcel_probe_generic' [-Werror,-Wunused-function] static int safexcel_probe_generic(void *pdev, drivers/crypto/inside-secure/safexcel.c:1531:13: error: unused function 'safexcel_hw_reset_rings' [-Werror,-Wunused-function] static void safexcel_hw_reset_rings(struct safexcel_crypto_priv *priv) It's better to make the compiler see what is going on and remove such ifdef checks completely. In case of PCI, this is trivial since pci_register_driver() is defined to an empty function that makes the compiler subsequently drop all unused code silently. The global pcireg_rc/ofreg_rc variables are not actually needed here since the driver registration does not fail in ways that would make it helpful. For CONFIG_OF, an IS_ENABLED() check is still required, since platform drivers can exist both with and without it. A little change to linux/pci.h is needed to ensure that pcim_enable_device() is visible to the driver. Moving the declaration outside of ifdef would be sufficient here, but for consistency with the rest of the file, adding an inline helper is probably best. Fixes: 212ef6f29e5b ("crypto: inside-secure - Fix unused variable warning when CONFIG_PCI=n") Signed-off-by: Arnd Bergmann Acked-by: Bjorn Helgaas # pci.h Signed-off-by: Herbert Xu --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index f9088c89a534..1a6cf19eac2d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1686,6 +1686,7 @@ static inline struct pci_dev *pci_get_class(unsigned int class, static inline void pci_set_master(struct pci_dev *dev) { } static inline int pci_enable_device(struct pci_dev *dev) { return -EIO; } static inline void pci_disable_device(struct pci_dev *dev) { } +static inline int pcim_enable_device(struct pci_dev *pdev) { return -EIO; } static inline int pci_assign_resource(struct pci_dev *dev, int i) { return -EBUSY; } static inline int __pci_register_driver(struct pci_driver *drv, -- cgit v1.2.3 From 84a081f60db63aaae3665118203506aa09a7f94f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Oct 2019 11:11:40 -0700 Subject: bpf: Align struct bpf_prog_stats Do not risk spanning these small structures on two cache lines. Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191011181140.2898-1-edumazet@google.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b9d22338606..282e28bf41ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -363,7 +363,7 @@ struct bpf_prog_stats { u64 cnt; u64 nsecs; struct u64_stats_sync syncp; -}; +} __aligned(2 * sizeof(u64)); struct bpf_prog_aux { atomic_t refcnt; -- cgit v1.2.3 From e7a981050a7fb9a14b652365c00d9c5a025704ce Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 10 Oct 2019 15:18:49 +0200 Subject: devlink: propagate extack down to health reporter ops During health reporter operations, driver might want to fill-up the extack message, so propagate extack down to the health reporter ops. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4095657fc23f..6bf3b9e0595a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -507,11 +507,13 @@ enum devlink_health_reporter_state { struct devlink_health_reporter_ops { char *name; int (*recover)(struct devlink_health_reporter *reporter, - void *priv_ctx); + void *priv_ctx, struct netlink_ext_ack *extack); int (*dump)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg, void *priv_ctx); + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack); int (*diagnose)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg); + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack); }; /** -- cgit v1.2.3 From 6b7fe77c334ae59fed9500140e08f4f896b36871 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Aug 2019 14:22:40 +0100 Subject: arm/arm64: smccc/psci: add arm_smccc_1_1_get_conduit() SMCCC callers are currently amassing a collection of enums for the SMCCC conduit, and are having to dig into the PSCI driver's internals in order to figure out what to do. Let's clean this up, with common SMCCC_CONDUIT_* definitions, and an arm_smccc_1_1_get_conduit() helper that abstracts the PSCI driver's internal state. We can kill off the PSCI_CONDUIT_* definitions once we've migrated users over to the new interface. Signed-off-by: Mark Rutland Acked-by: Lorenzo Pieralisi Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- include/linux/arm-smccc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 080012a6f025..df01a8579034 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -80,6 +80,22 @@ #include #include + +enum arm_smccc_conduit { + SMCCC_CONDUIT_NONE, + SMCCC_CONDUIT_SMC, + SMCCC_CONDUIT_HVC, +}; + +/** + * arm_smccc_1_1_get_conduit() + * + * Returns the conduit to be used for SMCCCv1.1 or later. + * + * When SMCCCv1.1 is not present, returns SMCCC_CONDUIT_NONE. + */ +enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void); + /** * struct arm_smccc_res - Result from SMC/HVC call * @a0-a3 result values from registers 0 to 3 -- cgit v1.2.3 From a5520eac4d2dafb7a48c1b0f1c486afcebd6fe0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Aug 2019 14:22:43 +0100 Subject: firmware/psci: use common SMCCC_CONDUIT_* Now that we have common SMCCC_CONDUIT_* definitions, migrate the PSCI code over to them, and kill off the old PSCI_CONDUIT_* definitions. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Lorenzo Pieralisi Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- include/linux/psci.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/psci.h b/include/linux/psci.h index e2bacc6fd2f2..ebe0a881d13d 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -7,6 +7,7 @@ #ifndef __LINUX_PSCI_H #define __LINUX_PSCI_H +#include #include #include @@ -18,12 +19,6 @@ bool psci_tos_resident_on(int cpu); int psci_cpu_suspend_enter(u32 state); bool psci_power_state_is_valid(u32 state); -enum psci_conduit { - PSCI_CONDUIT_NONE, - PSCI_CONDUIT_SMC, - PSCI_CONDUIT_HVC, -}; - enum smccc_version { SMCCC_VERSION_1_0, SMCCC_VERSION_1_1, @@ -38,7 +33,7 @@ struct psci_operations { int (*affinity_info)(unsigned long target_affinity, unsigned long lowest_affinity_level); int (*migrate_info_type)(void); - enum psci_conduit conduit; + enum arm_smccc_conduit conduit; enum smccc_version smccc_version; }; -- cgit v1.2.3 From e6ea46511b1ae8c4491904c79411fcd29139af14 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 9 Aug 2019 14:22:44 +0100 Subject: firmware: arm_sdei: use common SMCCC_CONDUIT_* Now that we have common definitions for SMCCC conduits, move the SDEI code over to them, and remove the SDEI-specific definitions. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Lorenzo Pieralisi Acked-by: James Morse Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- include/linux/arm_sdei.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h index 3305ea7f9dc7..0a241c5c911d 100644 --- a/include/linux/arm_sdei.h +++ b/include/linux/arm_sdei.h @@ -5,12 +5,6 @@ #include -enum sdei_conduit_types { - CONDUIT_INVALID = 0, - CONDUIT_SMC, - CONDUIT_HVC, -}; - #include #ifdef CONFIG_ARM_SDE_INTERFACE -- cgit v1.2.3 From 734f9246e791d8da278957b2c326d7709b2a97c0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 11 Sep 2019 20:25:46 +0200 Subject: mm: refresh ZONE_DMA and ZONE_DMA32 comments in 'enum zone_type' These zones usage has evolved with time and the comments were outdated. This joins both ZONE_DMA and ZONE_DMA32 explanation and gives up to date examples on how they are used on different architectures. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Christoph Hellwig Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- include/linux/mmzone.h | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bda20282746b..b0a36d1580b6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -359,33 +359,40 @@ struct per_cpu_nodestat { #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { -#ifdef CONFIG_ZONE_DMA /* - * ZONE_DMA is used when there are devices that are not able - * to do DMA to all of addressable memory (ZONE_NORMAL). Then we - * carve out the portion of memory that is needed for these devices. - * The range is arch specific. + * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able + * to DMA to all of the addressable memory (ZONE_NORMAL). + * On architectures where this area covers the whole 32 bit address + * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller + * DMA addressing constraints. This distinction is important as a 32bit + * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit + * platforms may need both zones as they support peripherals with + * different DMA addressing limitations. + * + * Some examples: + * + * - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the + * rest of the lower 4G. + * + * - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on + * the specific device. + * + * - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the + * lower 4G. * - * Some examples + * - powerpc only uses ZONE_DMA, the size, up to 2G, may vary + * depending on the specific device. * - * Architecture Limit - * --------------------------- - * parisc, ia64, sparc <4G - * s390, powerpc <2G - * arm Various - * alpha Unlimited or 0-16MB. + * - s390 uses ZONE_DMA fixed to the lower 2G. * - * i386, x86_64 and multiple other arches - * <16M. + * - ia64 and riscv only use ZONE_DMA32. + * + * - parisc uses neither. */ +#ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 - /* - * x86_64 needs two ZONE_DMAs because it supports devices that are - * only able to do DMA to the lower 16M but also 32 bit devices that - * can only do DMA areas below 4G. - */ ZONE_DMA32, #endif /* -- cgit v1.2.3 From b2c98153f45fc17b9fcb241000f2d131ddea6030 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:30 +0300 Subject: spi: introduce spi_delay struct as "value + unit" & spi_delay_exec() There are plenty of delays that have been introduced in SPI core. Most of them are in micro-seconds, some need to be in nano-seconds, and some in clock-cycles. For some of these delays (related to transfers & CS timing) it may make sense to have a `spi_delay` struct that abstracts these a bit. The important element of these delays [for unification] seems to be the `unit` of the delay. It looks like micro-seconds is good enough for most people, but every-once in a while, some delays seem to require other units of measurement. This change adds the `spi_delay` struct & a `spi_delay_exec()` function that processes a `spi_delay` object/struct to execute the delay. It's a copy of the `cs_change_delay` mechanism, but without the default for 10 uS. The clock-cycle delay unit is a bit special, as it needs to be bound to an `spi_transfer` object to execute. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-3-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 27f6b046cf92..8f643de3197b 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -90,6 +90,21 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats, #define SPI_STATISTICS_INCREMENT_FIELD(stats, field) \ SPI_STATISTICS_ADD_TO_FIELD(stats, field, 1) +/** + * struct spi_delay - SPI delay information + * @value: Value for the delay + * @unit: Unit for the delay + */ +struct spi_delay { +#define SPI_DELAY_UNIT_USECS 0 +#define SPI_DELAY_UNIT_NSECS 1 +#define SPI_DELAY_UNIT_SCK 2 + u16 value; + u8 unit; +}; + +extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); + /** * struct spi_device - Controller side proxy for an SPI slave device * @dev: Driver model representation of the device. @@ -887,9 +902,6 @@ struct spi_transfer { u16 delay_usecs; u16 cs_change_delay; u8 cs_change_delay_unit; -#define SPI_DELAY_UNIT_USECS 0 -#define SPI_DELAY_UNIT_NSECS 1 -#define SPI_DELAY_UNIT_SCK 2 u32 speed_hz; u16 word_delay; -- cgit v1.2.3 From 329f0dac4cad9fa4b1439a88180d91bcb5c4eaf8 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:31 +0300 Subject: spi: make `cs_change_delay` the first user of the `spi_delay` logic Since the logic for `spi_delay` struct + `spi_delay_exec()` has been copied from the `cs_change_delay` logic, it's natural to make this delay, the first user. The `cs_change_delay` logic requires that the default remain 10 uS, in case it is unspecified/unconfigured. So, there is some special handling needed to do that. The ADIS library is one of the few users of the new `cs_change_delay` parameter for an spi_transfer. The introduction of the `spi_delay` struct, requires that the users of of `cs_change_delay` get an update. This change also updates the ADIS library. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-4-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8f643de3197b..7670be934643 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -778,7 +778,6 @@ extern void spi_res_release(struct spi_controller *ctlr, * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message - * @cs_change_delay_unit: unit of cs_change_delay * @delay_usecs: microseconds to delay after this transfer before * (optionally) changing the chipselect status, then starting * the next transfer or completing this @spi_message. @@ -900,8 +899,7 @@ struct spi_transfer { u8 bits_per_word; u8 word_delay_usecs; u16 delay_usecs; - u16 cs_change_delay; - u8 cs_change_delay_unit; + struct spi_delay cs_change_delay; u32 speed_hz; u16 word_delay; -- cgit v1.2.3 From 84593a131c3af21d686d05c4b4432290a415d399 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:32 +0300 Subject: spi: sprd: convert transfer word delay to spi_delay struct The Spreadtrum SPI driver is the only user of the `word_delay` field in the `spi_transfer` struct. This change converts the field to use the `spi_delay` struct. This also enforces the users to specify the delay unit to be `SPI_DELAY_UNIT_SCK`. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-5-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7670be934643..6cb67ad53ffa 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -783,7 +783,7 @@ extern void spi_res_release(struct spi_controller *ctlr, * the next transfer or completing this @spi_message. * @word_delay_usecs: microseconds to inter word delay after each word size * (set by bits_per_word) transmission. - * @word_delay: clock cycles to inter word delay after each word size + * @word_delay: inter word delay to be introduced after each word size * (set by bits_per_word) transmission. * @effective_speed_hz: the effective SCK-speed that was used to * transfer this transfer. Set to 0 if the spi bus driver does @@ -900,8 +900,8 @@ struct spi_transfer { u8 word_delay_usecs; u16 delay_usecs; struct spi_delay cs_change_delay; + struct spi_delay word_delay; u32 speed_hz; - u16 word_delay; u32 effective_speed_hz; -- cgit v1.2.3 From 6c613f68aabf33385c01e949204ac5ed30887161 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:35 +0300 Subject: spi: core,atmel: convert `word_delay_usecs` -> `word_delay` for spi_device This change does a conversion from the `word_delay_usecs` -> `word_delay` for the `spi_device` struct. This allows users to specify inter-word delays in other unit types (nano-seconds or clock cycles), depending on how users want. The Atmel SPI driver is the only current user of the `word_delay_usecs` field (from the `spi_device` struct). So, it needed a slight conversion to use the `word_delay` as an `spi_delay` struct. In SPI core, the only required mechanism is to update the `word_delay` information per `spi_transfer`. This requires a bit more logic than before, because it needs that both delays be converted to a common unit (nano-seconds) for comparison. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-8-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 6cb67ad53ffa..ebeb272aeb0f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -139,7 +139,7 @@ extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); * the spi_master. * @cs_gpiod: gpio descriptor of the chipselect line (optional, NULL when * not using a GPIO line) - * @word_delay_usecs: microsecond delay to be inserted between consecutive + * @word_delay: delay to be inserted between consecutive * words of a transfer * * @statistics: statistics for the spi_device @@ -189,7 +189,7 @@ struct spi_device { const char *driver_override; int cs_gpio; /* LEGACY: chip select gpio */ struct gpio_desc *cs_gpiod; /* chip select gpio desc */ - uint8_t word_delay_usecs; /* inter-word delay */ + struct spi_delay word_delay; /* inter-word delay */ /* the statistics */ struct spi_statistics statistics; @@ -781,8 +781,6 @@ extern void spi_res_release(struct spi_controller *ctlr, * @delay_usecs: microseconds to delay after this transfer before * (optionally) changing the chipselect status, then starting * the next transfer or completing this @spi_message. - * @word_delay_usecs: microseconds to inter word delay after each word size - * (set by bits_per_word) transmission. * @word_delay: inter word delay to be introduced after each word size * (set by bits_per_word) transmission. * @effective_speed_hz: the effective SCK-speed that was used to @@ -897,7 +895,6 @@ struct spi_transfer { #define SPI_NBITS_DUAL 0x02 /* 2bits transfer */ #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ u8 bits_per_word; - u8 word_delay_usecs; u16 delay_usecs; struct spi_delay cs_change_delay; struct spi_delay word_delay; -- cgit v1.2.3 From bebcfd272df648542c458d28fbd6a8f9428b5310 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:36 +0300 Subject: spi: introduce `delay` field for `spi_transfer` + spi_transfer_delay_exec() The change introduces the `delay` field to the `spi_transfer` struct as an `struct spi_delay` type. This intends to eventually replace `delay_usecs`. But, since there are many users of `delay_usecs`, this needs some intermediate work. A helper called `spi_transfer_delay_exec()` is also added, which maintains backwards compatibility with `delay_usecs`, by assigning the value to `delay` if non-zero. This should maintain backwards compatibility with current users of `udelay_usecs`. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-9-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index ebeb272aeb0f..fe5b85df2c79 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -778,6 +778,9 @@ extern void spi_res_release(struct spi_controller *ctlr, * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message + * @delay: delay to be introduced after this transfer before + * (optionally) changing the chipselect status, then starting + * the next transfer or completing this @spi_message. * @delay_usecs: microseconds to delay after this transfer before * (optionally) changing the chipselect status, then starting * the next transfer or completing this @spi_message. @@ -896,6 +899,7 @@ struct spi_transfer { #define SPI_NBITS_QUAD 0x04 /* 4bits transfer */ u8 bits_per_word; u16 delay_usecs; + struct spi_delay delay; struct spi_delay cs_change_delay; struct spi_delay word_delay; u32 speed_hz; @@ -1003,6 +1007,20 @@ spi_transfer_del(struct spi_transfer *t) list_del(&t->transfer_list); } +static inline int +spi_transfer_delay_exec(struct spi_transfer *t) +{ + struct spi_delay d; + + if (t->delay_usecs) { + d.value = t->delay_usecs; + d.unit = SPI_DELAY_UNIT_USECS; + return spi_delay_exec(&d, NULL); + } + + return spi_delay_exec(&t->delay, t); +} + /** * spi_message_init_with_transfers - Initialize spi_message and append transfers * @m: spi_message to be initialized -- cgit v1.2.3 From 8105936684681195d9073880b06a123b2e316811 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:42 +0300 Subject: spi: tegra114: change format for `spi_set_cs_timing()` function The initial version of `spi_set_cs_timing()` was implemented with consideration only for clock-cycles as delay. For cases like `CS setup` time, it's sometimes needed that micro-seconds (or nano-seconds) are required, or sometimes even longer delays, for cases where the device needs a little longer to start transferring that after CS is asserted. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-15-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index fe5b85df2c79..f9b4ba2db08d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -524,8 +524,8 @@ struct spi_controller { * to configure specific CS timing through spi_set_cs_timing() after * spi_setup(). */ - void (*set_cs_timing)(struct spi_device *spi, u8 setup_clk_cycles, - u8 hold_clk_cycles, u8 inactive_clk_cycles); + int (*set_cs_timing)(struct spi_device *spi, struct spi_delay *setup, + struct spi_delay *hold, struct spi_delay *inactive); /* bidirectional bulk transfers * @@ -1068,7 +1068,10 @@ static inline void spi_message_free(struct spi_message *m) kfree(m); } -extern void spi_set_cs_timing(struct spi_device *spi, u8 setup, u8 hold, u8 inactive_dly); +extern int spi_set_cs_timing(struct spi_device *spi, + struct spi_delay *setup, + struct spi_delay *hold, + struct spi_delay *inactive); extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); -- cgit v1.2.3 From 25093bdeb6bcae728e12e3795261dbd3677060a9 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:43 +0300 Subject: spi: implement SW control for CS times This change implements CS control for setup, hold & inactive delays. The `cs_setup` delay is completely new, and can help with cases where asserting the CS, also brings the device out of power-sleep, where there needs to be a longer (than usual), before transferring data. The `cs_hold` time can overlap with the `delay` (or `delay_usecs`) from an SPI transfer. The main difference is that `cs_hold` implies that CS will be de-asserted. The `cs_inactive` delay does not have a clear use-case yet. It has been implemented mostly because the `spi_set_cs_timing()` function implements it. To some degree, this could overlap or replace `cs_change_delay`, but this will require more consideration/investigation in the future. All these delays have been added to the `spi_controller` struct, as they would typically be configured by calling `spi_set_cs_timing()` after an `spi_setup()` call. Software-mode for CS control, implies that the `set_cs_timing()` hook has not been provided for the `spi_controller` object. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-16-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index f9b4ba2db08d..cfd87b18f077 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -609,6 +609,11 @@ struct spi_controller { /* Optimized handlers for SPI memory-like operations. */ const struct spi_controller_mem_ops *mem_ops; + /* CS delays */ + struct spi_delay cs_setup; + struct spi_delay cs_hold; + struct spi_delay cs_inactive; + /* gpio chip select */ int *cs_gpios; struct gpio_desc **cs_gpiods; -- cgit v1.2.3 From 3984d39b0e41ac4de8b4530ae3911ccf52ed4bbf Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 26 Sep 2019 13:51:44 +0300 Subject: spi: spi-fsl-espi: convert transfer delay to `spi_delay` format The way the max delay is computed for this controller, it looks like it is searching for the max delay from an SPI message a using that. No idea if this is valid. But this change should support both `delay_usecs` and the new `delay` data which is of `spi_delay` type. Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20190926105147.7839-17-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index cfd87b18f077..c40d6af2bf07 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -103,6 +103,7 @@ struct spi_delay { u8 unit; }; +extern int spi_delay_to_ns(struct spi_delay *_delay, struct spi_transfer *xfer); extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); /** -- cgit v1.2.3 From f226650494c6aa87526d12135b7de8b8c074f3de Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Oct 2019 10:06:12 +0100 Subject: arm64: Relax ICC_PMR_EL1 accesses when ICC_CTLR_EL1.PMHE is clear The GICv3 architecture specification is incredibly misleading when it comes to PMR and the requirement for a DSB. It turns out that this DSB is only required if the CPU interface sends an Upstream Control message to the redistributor in order to update the RD's view of PMR. This message is only sent when ICC_CTLR_EL1.PMHE is set, which isn't the case in Linux. It can still be set from EL3, so some special care is required. But the upshot is that in the (hopefuly large) majority of the cases, we can drop the DSB altogether. This relies on a new static key being set if the boot CPU has PMHE set. The drawback is that this static key has to be exported to modules. Cc: Will Deacon Cc: James Morse Cc: Julien Thierry Cc: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas --- include/linux/irqchip/arm-gic-v3.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 5cc10cf7cb3e..a0bde9e12efa 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -487,6 +487,8 @@ #define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) #define ICC_CTLR_EL1_CBPR_SHIFT 0 #define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) #define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 #define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) #define ICC_CTLR_EL1_ID_BITS_SHIFT 11 -- cgit v1.2.3 From 14af7fd1d4279c8db7fbbb3ca0df3b13179eb502 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Oct 2019 18:27:57 +0200 Subject: ethtool: Add support for 400Gbps (50Gbps per lane) link modes Add support for 400Gbps speed, link modes of 50Gbps per lane Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8938b76c4ee3..d4591792f0b4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1507,6 +1507,11 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -1618,6 +1623,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_56000 56000 #define SPEED_100000 100000 #define SPEED_200000 200000 +#define SPEED_400000 400000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From 554032cdfbf4491f38241a3f6b27459408d90df3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 15 Oct 2019 11:28:46 +0100 Subject: net: phylink: use more linkmode_* Use more linkmode_* helpers rather than open-coding the bitmap operations. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/linkmode.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index a99c58866860..fe740031339d 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -82,4 +82,10 @@ static inline int linkmode_equal(const unsigned long *src1, return bitmap_equal(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline int linkmode_subset(const unsigned long *src1, + const unsigned long *src2) +{ + return bitmap_subset(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + #endif /* __LINKMODE_H */ -- cgit v1.2.3 From 2203cbf2c8b58a1e3bef98c47531d431d11639a0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 15 Oct 2019 11:38:39 +0100 Subject: net: sfp: move fwnode parsing into sfp-bus layer Rather than parsing the sfp firmware node in phylink, parse it in the sfp-bus code, so we can re-use this code for PHYs without having to duplicate the parsing. Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/sfp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 1c35428e98bc..355a08a76fd4 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -508,9 +508,9 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data); void sfp_upstream_start(struct sfp_bus *bus); void sfp_upstream_stop(struct sfp_bus *bus); -struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode, - void *upstream, - const struct sfp_upstream_ops *ops); +struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode, + void *upstream, + const struct sfp_upstream_ops *ops); void sfp_unregister_upstream(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, @@ -553,11 +553,11 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus) { } -static inline struct sfp_bus *sfp_register_upstream( +static inline struct sfp_bus *sfp_register_upstream_node( struct fwnode_handle *fwnode, void *upstream, const struct sfp_upstream_ops *ops) { - return (struct sfp_bus *)-1; + return NULL; } static inline void sfp_unregister_upstream(struct sfp_bus *bus) -- cgit v1.2.3 From 2ad9d7747c10d17cc06447944fefd4c29ae11eb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Oct 2019 15:19:15 +0200 Subject: netfilter: conntrack: free extension area immediately Instead of waiting for rcu grace period just free it directly. This is safe because conntrack lookup doesn't consider extensions. Other accesses happen while ct->ext can't be free'd, either because a ct refcount was taken or because the conntrack hash bucket lock or the dying list spinlock have been taken. This allows to remove __krealloc in a followup patch, netfilter was the only user. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 112a6f40dfaf..5ae5295aa46d 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -43,7 +43,6 @@ enum nf_ct_ext_id { /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { - struct rcu_head rcu; u8 offset[NF_CT_EXT_NUM]; u8 len; char data[0]; @@ -72,15 +71,6 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) /* Destroy all relationships */ void nf_ct_ext_destroy(struct nf_conn *ct); -/* Free operation. If you want to free a object referred from private area, - * please implement __nf_ct_ext_free() and call it. - */ -static inline void nf_ct_ext_free(struct nf_conn *ct) -{ - if (ct->ext) - kfree_rcu(ct->ext, rcu); -} - /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); -- cgit v1.2.3 From ca58fbe06c54795f00db79e447f94c2028d30124 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 11 Oct 2019 00:30:37 +0200 Subject: netfilter: add and use nf_hook_slow_list() At this time, NF_HOOK_LIST() macro will iterate the list and then calls nf_hook() for each individual skb. This makes it so the entire list is passed into the netfilter core. The advantage is that we only need to fetch the rule blob once per list instead of per-skb. NF_HOOK_LIST now only works for ipv4 and ipv6, as those are the only callers. v2: use skb_list_del_init() instead of list_del (Edward Cree) Signed-off-by: Florian Westphal Acked-by: Edward Cree Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 77ebb61faf48..eb312e7ca36e 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -199,6 +199,8 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); +void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, + const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * @@ -311,17 +313,36 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct sk_buff *skb, *next; - struct list_head sublist; - - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { - list_del(&skb->list); - if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1) - list_add_tail(&skb->list, &sublist); + struct nf_hook_entries *hook_head = NULL; + +#ifdef CONFIG_JUMP_LABEL + if (__builtin_constant_p(pf) && + __builtin_constant_p(hook) && + !static_key_false(&nf_hooks_needed[pf][hook])) + return; +#endif + + rcu_read_lock(); + switch (pf) { + case NFPROTO_IPV4: + hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); + break; + case NFPROTO_IPV6: + hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); + break; + default: + WARN_ON_ONCE(1); + break; } - /* Put passed packets back on main list */ - list_splice(&sublist, head); + + if (hook_head) { + struct nf_hook_state state; + + nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); + + nf_hook_slow_list(head, &state, hook_head); + } + rcu_read_unlock(); } /* Call setsockopt() */ -- cgit v1.2.3 From 3d6d8da48d0b214d65ea0227d47228abc75d7c88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:28 +0200 Subject: pidfd: check pid has attached task in fdinfo Currently, when a task is dead we still print the pid it used to use in the fdinfo files of its pidfds. This doesn't make much sense since the pid may have already been reused. So verify that the task is still alive by introducing the pid_has_task() helper which will be used by other callers in follow-up patches. If the task is not alive anymore, we will print -1. This allows us to differentiate between a task not being present in a given pid namespace - in which case we already print 0 - and a task having been reaped. Note that this uses PIDTYPE_PID for the check. Technically, we could've checked PIDTYPE_TGID since pidfds currently only refer to thread-group leaders but if they won't anymore in the future then this check becomes problematic without it being immediately obvious to non-experts imho. If a thread is created via clone(CLONE_THREAD) than struct pid has a single non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even though the thread-group leader might still be very much alive. So checking PIDTYPE_PID is fine and is easier to maintain should we ever allow pidfds to refer to threads. Cc: Jann Horn Cc: Christian Kellner Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-1-christian.brauner@ubuntu.com --- include/linux/pid.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 9645b1194c98..034e3cd60dc0 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid) extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); +static inline bool pid_has_task(struct pid *pid, enum pid_type type) +{ + return !hlist_empty(&pid->tasks[type]); +} extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); -- cgit v1.2.3 From 57f5677e535ba24b8926a7125be2ef8d7f09323c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 15 Oct 2019 21:07:05 +0200 Subject: printf: add support for printing symbolic error names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has been suggested several times to extend vsnprintf() to be able to convert the numeric value of ENOSPC to print "ENOSPC". This implements that as a %p extension: With %pe, one can do if (IS_ERR(foo)) { pr_err("Sorry, can't do that: %pe\n", foo); return PTR_ERR(foo); } instead of what is seen in quite a few places in the kernel: if (IS_ERR(foo)) { pr_err("Sorry, can't do that: %ld\n", PTR_ERR(foo)); return PTR_ERR(foo); } If the value passed to %pe is an ERR_PTR, but the library function errname() added here doesn't know about the value, the value is simply printed in decimal. If the value passed to %pe is not an ERR_PTR, we treat it as an ordinary %p and thus print the hashed value (passing non-ERR_PTR values to %pe indicates a bug in the caller, but we can't do much about that). With my embedded hat on, and because it's not very invasive to do, I've made it possible to remove this. The errname() function and associated lookup tables take up about 3K. For most, that's probably quite acceptable and a price worth paying for more readable dmesg (once this starts getting used), while for those that disable printk() it's of very little use - I don't see a procfs/sysfs/seq_printf() file reasonably making use of this - and they clearly want to squeeze vmlinux as much as possible. Hence the default y if PRINTK. The symbols to include have been found by massaging the output of find arch include -iname 'errno*.h' | xargs grep -E 'define\s*E' In the cases where some common aliasing exists (e.g. EAGAIN=EWOULDBLOCK on all platforms, EDEADLOCK=EDEADLK on most), I've moved the more popular one (in terms of 'git grep -w Efoo | wc) to the bottom so that one takes precedence. Link: http://lkml.kernel.org/r/20191015190706.15989-1-linux@rasmusvillemoes.dk To: "Jonathan Corbet" To: linux-kernel@vger.kernel.org Cc: "Andy Shevchenko" Cc: "Andrew Morton" Cc: "Joe Perches" Cc: linux-doc@vger.kernel.org Signed-off-by: Rasmus Villemoes Acked-by: Uwe Kleine-König Reviewed-by: Petr Mladek [andy.shevchenko@gmail.com: use abs()] Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek --- include/linux/errname.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/errname.h (limited to 'include') diff --git a/include/linux/errname.h b/include/linux/errname.h new file mode 100644 index 000000000000..e8576ad90cb7 --- /dev/null +++ b/include/linux/errname.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ERRNAME_H +#define _LINUX_ERRNAME_H + +#include + +#ifdef CONFIG_SYMBOLIC_ERRNAME +const char *errname(int err); +#else +static inline const char *errname(int err) +{ + return NULL; +} +#endif + +#endif /* _LINUX_ERRNAME_H */ -- cgit v1.2.3 From e8c423fb31fa8b1ef6d7cd14a168de33e7c0d702 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:55 -0700 Subject: bpf: Add typecast to raw_tracepoints to help BTF generation When pahole converts dwarf to btf it emits only used types. Wrap existing __bpf_trace_##template() function into btf_trace_##template typedef and use it in type cast to make gcc emits this type into dwarf. Then pahole will convert it to btf. The "btf_trace_" prefix will be used to identify BTF enabled raw tracepoints. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-2-ast@kernel.org --- include/trace/bpf_probe.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index d6e556c0a085..b04c29270973 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -74,11 +74,12 @@ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ } \ +typedef void (*btf_trace_##call)(void *__data, proto); \ static struct bpf_raw_event_map __used \ __attribute__((section("__bpf_raw_tp_map"))) \ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ - .bpf_func = (void *)__bpf_trace_##template, \ + .bpf_func = (void *)(btf_trace_##call)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ .writable_size = size, \ }; -- cgit v1.2.3 From 7c6a469e3416fa23568c2395a3faa7dd6e376dcb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:56 -0700 Subject: bpf: Add typecast to bpf helpers to help BTF generation When pahole converts dwarf to btf it emits only used types. Wrap existing bpf helper functions into typedef and use it in typecast to make gcc emits this type into dwarf. Then pahole will convert it to btf. The "btf_#name_of_helper" types will be used to figure out types of arguments of bpf helpers. The generated code before and after is the same. Only dwarf and btf sections are different. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-3-ast@kernel.org --- include/linux/filter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 2ce57645f3cd..d3d51d7aff2c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -464,10 +464,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ - return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) -- cgit v1.2.3 From 8580ac9404f6240668a026785d7d8856f0530409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:57 -0700 Subject: bpf: Process in-kernel BTF If in-kernel BTF exists parse it and prepare 'struct btf *btf_vmlinux' for further use by the verifier. In-kernel BTF is trusted just like kallsyms and other build artifacts embedded into vmlinux. Yet run this BTF image through BTF verifier to make sure that it is valid and it wasn't mangled during the build. If in-kernel BTF is incorrect it means either gcc or pahole or kernel are buggy. In such case disallow loading BPF programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-4-ast@kernel.org --- include/linux/bpf_verifier.h | 4 +++- include/linux/btf.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 26a6d58ca78c..713efae62e96 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -330,10 +330,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_STATS 4 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log->level && log->ubuf && !bpf_verifier_log_full(log); + return (log->level && log->ubuf && !bpf_verifier_log_full(log)) || + log->level == BPF_LOG_KERNEL; } #define BPF_MAX_SUBPROGS 256 diff --git a/include/linux/btf.h b/include/linux/btf.h index 64cdf2a23d42..55d43bc856be 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -56,6 +56,7 @@ bool btf_type_is_void(const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); +struct btf *btf_parse_vmlinux(void); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) -- cgit v1.2.3 From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In a later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btd_id > 0 raw_tp with BTF and additional type safety. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 282e28bf41ec..f916380675dd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_prog_aux { u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ + u32 attach_btf_id; /* in-kernel BTF type id to attach to */ bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; struct bpf_prog **func; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a65c3b0c6935..3bb2cd1de341 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -420,6 +420,7 @@ union bpf_attr { __u32 line_info_rec_size; /* userspace bpf_line_info size */ __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:00 -0700 Subject: bpf: Implement accurate raw_tp context access via BTF libbpf analyzes bpf C program, searches in-kernel BTF for given type name and stores it into expected_attach_type. The kernel verifier expects this btf_id to point to something like: typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc); which represents signature of raw_tracepoint "kfree_skb". Then btf_ctx_access() matches ctx+0 access in bpf program with 'skb' and 'ctx+8' access with 'loc' arguments of "kfree_skb" tracepoint. In first case it passes btf_id of 'struct sk_buff *' back to the verifier core and 'void *' in second case. Then the verifier tracks PTR_TO_BTF_ID as any other pointer type. Like PTR_TO_SOCKET points to 'struct bpf_sock', PTR_TO_TCP_SOCK points to 'struct bpf_tcp_sock', and so on. PTR_TO_BTF_ID points to in-kernel structs. If 1234 is btf_id of 'struct sk_buff' in vmlinux's BTF then PTR_TO_BTF_ID#1234 points to one of in kernel skbs. When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)r1 + 32) the btf_struct_access() checks which field of 'struct sk_buff' is at offset 32. Checks that size of access matches type definition of the field and continues to track the dereferenced type. If that field was a pointer to 'struct net_device' the r2's type will be PTR_TO_BTF_ID#456. Where 456 is btf_id of 'struct net_device' in vmlinux's BTF. Such verifier analysis prevents "cheating" in BPF C program. The program cannot cast arbitrary pointer to 'struct sk_buff *' and access it. C compiler would allow type cast, of course, but the verifier will notice type mismatch based on BPF assembly and in-kernel BTF. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org --- include/linux/bpf.h | 17 ++++++++++++++++- include/linux/bpf_verifier.h | 4 ++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f916380675dd..028555fcd10d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include struct bpf_verifier_env; +struct bpf_verifier_log; struct perf_event; struct bpf_prog; struct bpf_map; @@ -281,6 +282,7 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ + PTR_TO_BTF_ID, /* reg points to kernel struct */ }; /* The information passed from prog-specific *_is_valid_access @@ -288,7 +290,11 @@ enum bpf_reg_type { */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; - int ctx_field_size; + union { + int ctx_field_size; + u32 btf_id; + }; + struct bpf_verifier_log *log; /* for verbose logs */ }; static inline void @@ -483,6 +489,7 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); +const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -748,6 +755,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +bool btf_ctx_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info); +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 713efae62e96..6e7284ea1468 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -52,6 +52,8 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; + u32 btf_id; /* for PTR_TO_BTF_ID */ + /* Max size from any of the above. */ unsigned long raw; }; @@ -399,6 +401,8 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:02 -0700 Subject: bpf: Add support for BTF pointers to interpreter Pointer to BTF object is a pointer to kernel object or NULL. The memory access in the interpreter has to be done via probe_kernel_read to avoid page faults. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d3d51d7aff2c..22ebea2e64ea 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -65,6 +65,9 @@ struct ctl_table_header; /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 +/* unused opcode to mark special load instruction. Same as BPF_ABS */ +#define BPF_PROBE_MEM 0x20 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 -- cgit v1.2.3 From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:03 -0700 Subject: bpf: Add support for BTF pointers to x86 JIT Pointer to BTF object is a pointer to kernel object or NULL. Such pointers can only be used by BPF_LDX instructions. The verifier changed their opcode from LDX|MEM|size to LDX|PROBE_MEM|size to make JITing easier. The number of entries in extable is the number of BPF_LDX insns that access kernel memory via "pointer to BTF type". Only these load instructions can fault. Since x86 extable is relative it has to be allocated in the same memory region as JITed code. Allocate it prior to last pass of JITing and let the last pass populate it. Pointer to extable in bpf_prog_aux is necessary to make page fault handling fast. Page fault handling is done in two steps: 1. bpf_prog_kallsyms_find() finds BPF program that page faulted. It's done by walking rb tree. 2. then extable for given bpf program is binary searched. This process is similar to how page faulting is done for kernel modules. The exception handler skips over faulting x86 instruction and initializes destination register with zero. This mimics exact behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed). JITs for other architectures can add support in similar way. Until then they will reject unknown opcode and fallback to interpreter. Since extable should be aligned and placed near JITed code make bpf_jit_binary_alloc() return 4 byte aligned image offset, so that extable aligning formula in bpf_int_jit_compile() doesn't need to rely on internal implementation of bpf_jit_binary_alloc(). On x86 gcc defaults to 16-byte alignment for regular kernel functions due to better performance. JITed code may be aligned to 16 in the future, but it will use 4 in the meantime. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org --- include/linux/bpf.h | 3 +++ include/linux/extable.h | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 028555fcd10d..a7330d75bb94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,7 @@ struct sock; struct seq_file; struct btf; struct btf_type; +struct exception_table_entry; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -423,6 +424,8 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + u32 num_exentries; + struct exception_table_entry *extable; struct bpf_prog_stats __percpu *stats; union { struct work_struct work; diff --git a/include/linux/extable.h b/include/linux/extable.h index 81ecfaa83ad3..4ab9e78f313b 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -33,4 +33,14 @@ search_module_extables(unsigned long addr) } #endif /*CONFIG_MODULES*/ +#ifdef CONFIG_BPF_JIT +const struct exception_table_entry *search_bpf_extables(unsigned long addr); +#else +static inline const struct exception_table_entry * +search_bpf_extables(unsigned long addr) +{ + return NULL; +} +#endif + #endif /* _LINUX_EXTABLE_H */ -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/linux/bpf.h | 18 +++++++++++++----- include/uapi/linux/bpf.h | 27 ++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a7330d75bb94..2c2c29b49845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -213,6 +213,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ }; /* type of values returned from helper functions */ @@ -235,11 +236,17 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; enum bpf_return_type ret_type; - enum bpf_arg_type arg1_type; - enum bpf_arg_type arg2_type; - enum bpf_arg_type arg3_type; - enum bpf_arg_type arg4_type; - enum bpf_arg_type arg5_type; + union { + struct { + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; + }; + enum bpf_arg_type arg_type[5]; + }; + u32 *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -765,6 +772,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 4f3d957718e7f0ac2b033dbf48c7cddecd0a8dd3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 18 Oct 2019 13:54:25 +0300 Subject: spi: pxa2xx: No need to keep pointer to platform device There is no need to keep a pointer to the platform device. Currently there are no users of it directly, and if there will be in the future we may restore it from pointer to the struct device. Convert all users at the same time. Cc: Russell King Cc: Jaroslav Kysela Cc: Takashi Iwai Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20191018105429.82782-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index a5d1837e4f35..6facf27865f9 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -206,7 +206,7 @@ enum pxa_ssp_type { }; struct ssp_device { - struct platform_device *pdev; + struct device *dev; struct list_head node; struct clk *clk; -- cgit v1.2.3 From 1a9167a214f560a23c5050ce6dfebae489528f0d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 19 Jun 2019 13:01:27 -0300 Subject: KVM: PPC: Report single stepping capability When calling the KVM_SET_GUEST_DEBUG ioctl, userspace might request the next instruction to be single stepped via the KVM_GUESTDBG_SINGLESTEP control bit of the kvm_guest_debug structure. This patch adds the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability in order to inform userspace about the state of single stepping support. We currently don't have support for guest single stepping implemented in Book3S HV so the capability is only present for Book3S PR and BookE. Signed-off-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..ce8cfcc51aec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1000,6 +1000,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From c726200dd106d4c58a281eea7159b8ba28a4ab34 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 11 Oct 2019 13:07:05 +0200 Subject: KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace For a long time, if a guest accessed memory outside of a memslot using any of the load/store instructions in the architecture which doesn't supply decoding information in the ESR_EL2 (the ISV bit is not set), the kernel would print the following message and terminate the VM as a result of returning -ENOSYS to userspace: load/store instruction decoding not implemented The reason behind this message is that KVM assumes that all accesses outside a memslot is an MMIO access which should be handled by userspace, and we originally expected to eventually implement some sort of decoding of load/store instructions where the ISV bit was not set. However, it turns out that many of the instructions which don't provide decoding information on abort are not safe to use for MMIO accesses, and the remaining few that would potentially make sense to use on MMIO accesses, such as those with register writeback, are not used in practice. It also turns out that fetching an instruction from guest memory can be a pretty horrible affair, involving stopping all CPUs on SMP systems, handling multiple corner cases of address translation in software, and more. It doesn't appear likely that we'll ever implement this in the kernel. What is much more common is that a user has misconfigured his/her guest and is actually not accessing an MMIO region, but just hitting some random hole in the IPA space. In this scenario, the error message above is almost misleading and has led to a great deal of confusion over the years. It is, nevertheless, ABI to userspace, and we therefore need to introduce a new capability that userspace explicitly enables to change behavior. This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV) which does exactly that, and introduces a new exit reason to report the event to userspace. User space can then emulate an exception to the guest, restart the guest, suspend the guest, or take any other appropriate action as per the policy of the running system. Reported-by: Heinrich Schuchardt Signed-off-by: Christoffer Dall Reviewed-by: Alexander Graf Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..7336ee8d98d7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -235,6 +235,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_S390_STSI 25 #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 +#define KVM_EXIT_ARM_NISV 28 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -394,6 +395,11 @@ struct kvm_run { } eoi; /* KVM_EXIT_HYPERV */ struct kvm_hyperv_exit hyperv; + /* KVM_EXIT_ARM_NISV */ + struct { + __u64 esr_iss; + __u64 fault_ipa; + } arm_nisv; /* Fix the size of the union. */ char padding[256]; }; @@ -1000,6 +1006,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_ARM_NISV_TO_USER 176 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From da345174ceca052469e4775e4ae263b5f27a9355 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 11 Oct 2019 13:07:06 +0200 Subject: KVM: arm/arm64: Allow user injection of external data aborts In some scenarios, such as buggy guest or incorrect configuration of the VMM and firmware description data, userspace will detect a memory access to a portion of the IPA, which is not mapped to any MMIO region. For this purpose, the appropriate action is to inject an external abort to the guest. The kernel already has functionality to inject an external abort, but we need to wire up a signal from user space that lets user space tell the kernel to do this. It turns out, we already have the set event functionality which we can perfectly reuse for this. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7336ee8d98d7..65db5a4257ec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1007,6 +1007,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 #define KVM_CAP_ARM_NISV_TO_USER 176 +#define KVM_CAP_ARM_INJECT_EXT_DABT 177 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 55009c6ed2d24fc0f5521ab2482f145d269389ea Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 21 Oct 2019 16:28:15 +0100 Subject: KVM: arm/arm64: Factor out hypercall handling from PSCI code We currently intertwine the KVM PSCI implementation with the general dispatch of hypercall handling, which makes perfect sense because PSCI is the only category of hypercalls we support. However, as we are about to support additional hypercalls, factor out this functionality into a separate hypercall handler file. Signed-off-by: Christoffer Dall [steven.price@arm.com: rebased] Reviewed-by: Andrew Jones Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/Kbuild | 2 ++ include/kvm/arm_hypercalls.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/kvm/arm_psci.h | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 include/kvm/arm_hypercalls.h (limited to 'include') diff --git a/include/Kbuild b/include/Kbuild index ffba79483cc5..e8154f8bcac5 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -67,6 +67,8 @@ header-test- += keys/big_key-type.h header-test- += keys/request_key_auth-type.h header-test- += keys/trusted.h header-test- += kvm/arm_arch_timer.h +header-test-$(CONFIG_ARM) += kvm/arm_hypercalls.h +header-test-$(CONFIG_ARM64) += kvm/arm_hypercalls.h header-test- += kvm/arm_pmu.h header-test-$(CONFIG_ARM) += kvm/arm_psci.h header-test-$(CONFIG_ARM64) += kvm/arm_psci.h diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h new file mode 100644 index 000000000000..0e2509d27910 --- /dev/null +++ b/include/kvm/arm_hypercalls.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2019 Arm Ltd. */ + +#ifndef __KVM_ARM_HYPERCALLS_H +#define __KVM_ARM_HYPERCALLS_H + +#include + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu); + +static inline u32 smccc_get_function(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 0); +} + +static inline unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 1); +} + +static inline unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 2); +} + +static inline unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 3); +} + +static inline void smccc_set_retval(struct kvm_vcpu *vcpu, + unsigned long a0, + unsigned long a1, + unsigned long a2, + unsigned long a3) +{ + vcpu_set_reg(vcpu, 0, a0); + vcpu_set_reg(vcpu, 1, a1); + vcpu_set_reg(vcpu, 2, a2); + vcpu_set_reg(vcpu, 3, a3); +} + +#endif diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 632e78bdef4d..5b58bd2fe088 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -40,7 +40,7 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm) } -int kvm_hvc_call_handler(struct kvm_vcpu *vcpu); +int kvm_psci_call(struct kvm_vcpu *vcpu); struct kvm_one_reg; -- cgit v1.2.3 From b48c1a45a190898103cec28771efc399fd65a05a Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:16 +0100 Subject: KVM: arm64: Implement PV_TIME_FEATURES call This provides a mechanism for querying which paravirtualized time features are available in this hypervisor. Also add the header file which defines the ABI for the paravirtualized time features we're about to add. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/linux/arm-smccc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index df01a8579034..92e0046ce7a7 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -45,6 +45,7 @@ #define ARM_SMCCC_OWNER_SIP 2 #define ARM_SMCCC_OWNER_OEM 3 #define ARM_SMCCC_OWNER_STANDARD 4 +#define ARM_SMCCC_OWNER_STANDARD_HYP 5 #define ARM_SMCCC_OWNER_TRUSTED_APP 48 #define ARM_SMCCC_OWNER_TRUSTED_APP_END 49 #define ARM_SMCCC_OWNER_TRUSTED_OS 50 @@ -318,5 +319,18 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define SMCCC_RET_NOT_SUPPORTED -1 #define SMCCC_RET_NOT_REQUIRED -2 +/* Paravirtualised time calls (defined by ARM DEN0057A) */ +#define ARM_SMCCC_HV_PV_TIME_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x20) + +#define ARM_SMCCC_HV_PV_TIME_ST \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x21) + #endif /*__ASSEMBLY__*/ #endif /*__LINUX_ARM_SMCCC_H*/ -- cgit v1.2.3 From cac0f1b7285eaaf9a186c618c3a7304d82ed5493 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:17 +0100 Subject: KVM: Implement kvm_put_guest() kvm_put_guest() is analogous to put_user() - it writes a single value to the guest physical address. The implementation is built upon put_user() and so it has the same single copy atomic properties. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 719fc3e15ea4..9907e45f8875 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -746,6 +746,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, unsigned long len); int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len); + +#define __kvm_put_guest(kvm, gfn, offset, value, type) \ +({ \ + unsigned long __addr = gfn_to_hva(kvm, gfn); \ + type __user *__uaddr = (type __user *)(__addr + offset); \ + int __ret = -EFAULT; \ + \ + if (!kvm_is_error_hva(__addr)) \ + __ret = put_user(value, __uaddr); \ + if (!__ret) \ + mark_page_dirty(kvm, gfn); \ + __ret; \ +}) + +#define kvm_put_guest(kvm, gpa, value, type) \ +({ \ + gpa_t __gpa = gpa; \ + struct kvm *__kvm = kvm; \ + __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \ + offset_in_page(__gpa), (value), type); \ +}) + int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); -- cgit v1.2.3 From 8564d6372a7d8a6d440441b8ed8020f97f744450 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:18 +0100 Subject: KVM: arm64: Support stolen time reporting via shared structure Implement the service call for configuring a shared structure between a VCPU and the hypervisor in which the hypervisor can write the time stolen from the VCPU's execution time by other tasks on the host. User space allocates memory which is placed at an IPA also chosen by user space. The hypervisor then updates the shared structure using kvm_put_guest() to ensure single copy atomicity of the 64-bit value reporting the stolen time in nanoseconds. Whenever stolen time is enabled by the guest, the stolen time counter is reset. The stolen time itself is retrieved from the sched_info structure maintained by the Linux scheduler code. We enable SCHEDSTATS when selecting KVM Kconfig to ensure this value is meaningful. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/linux/kvm_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index bde5374ae021..1c88e69db3d9 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -35,6 +35,8 @@ typedef unsigned long gva_t; typedef u64 gpa_t; typedef u64 gfn_t; +#define GPA_INVALID (~(gpa_t)0) + typedef unsigned long hva_t; typedef u64 hpa_t; typedef u64 hfn_t; -- cgit v1.2.3 From 8538cb22bbce5a988671b68baf0b0f9e86ca1e87 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:19 +0100 Subject: KVM: Allow kvm_device_ops to be const Currently a kvm_device_ops structure cannot be const without triggering compiler warnings. However the structure doesn't need to be written to and, by marking it const, it can be read-only in memory. Add some more const keywords to allow this. Reviewed-by: Andrew Jones Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9907e45f8875..7a26d5513471 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1262,7 +1262,7 @@ extern unsigned int halt_poll_ns_grow_start; extern unsigned int halt_poll_ns_shrink; struct kvm_device { - struct kvm_device_ops *ops; + const struct kvm_device_ops *ops; struct kvm *kvm; void *private; struct list_head vm_node; @@ -1315,7 +1315,7 @@ struct kvm_device_ops { void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); -int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); +int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; -- cgit v1.2.3 From 58772e9a3db72d032eeb12bc011bc5184a3925f4 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:20 +0100 Subject: KVM: arm64: Provide VCPU attributes for stolen time Allow user space to inform the KVM host where in the physical memory map the paravirtualized time structures should be located. User space can set an attribute on the VCPU providing the IPA base address of the stolen time structure for that VCPU. This must be repeated for every VCPU in the VM. The address is given in terms of the physical address visible to the guest and must be 64 byte aligned. The guest will discover the address via a hypercall. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..a540c8357049 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_XIVE, #define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE + KVM_DEV_TYPE_ARM_PV_TIME, +#define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_MAX, }; -- cgit v1.2.3 From 541625ac47ce9d0835efaee0fcbaa251b0000a37 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:21 +0100 Subject: arm/arm64: Provide a wrapper for SMCCC 1.1 calls SMCCC 1.1 calls may use either HVC or SMC depending on the PSCI conduit. Rather than coding this in every call site, provide a macro which uses the correct instruction. The macro also handles the case where no conduit is configured/available returning a not supported error in res, along with returning the conduit used for the call. This allow us to remove some duplicated code and will be useful later when adding paravirtualized time hypervisor calls. Signed-off-by: Steven Price Acked-by: Will Deacon Signed-off-by: Marc Zyngier --- include/linux/arm-smccc.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 92e0046ce7a7..59494df0f55b 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -319,6 +319,51 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define SMCCC_RET_NOT_SUPPORTED -1 #define SMCCC_RET_NOT_REQUIRED -2 +/* + * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED. + * Used when the SMCCC conduit is not defined. The empty asm statement + * avoids compiler warnings about unused variables. + */ +#define __fail_smccc_1_1(...) \ + do { \ + __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ + asm ("" __constraints(__count_args(__VA_ARGS__))); \ + if (___res) \ + ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \ + } while (0) + +/* + * arm_smccc_1_1_invoke() - make an SMCCC v1.1 compliant call + * + * This is a variadic macro taking one to eight source arguments, and + * an optional return structure. + * + * @a0-a7: arguments passed in registers 0 to 7 + * @res: result values from registers 0 to 3 + * + * This macro will make either an HVC call or an SMC call depending on the + * current SMCCC conduit. If no valid conduit is available then -1 + * (SMCCC_RET_NOT_SUPPORTED) is returned in @res.a0 (if supplied). + * + * The return value also provides the conduit that was used. + */ +#define arm_smccc_1_1_invoke(...) ({ \ + int method = arm_smccc_1_1_get_conduit(); \ + switch (method) { \ + case SMCCC_CONDUIT_HVC: \ + arm_smccc_1_1_hvc(__VA_ARGS__); \ + break; \ + case SMCCC_CONDUIT_SMC: \ + arm_smccc_1_1_smc(__VA_ARGS__); \ + break; \ + default: \ + __fail_smccc_1_1(__VA_ARGS__); \ + method = SMCCC_CONDUIT_NONE; \ + break; \ + } \ + method; \ + }) + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ -- cgit v1.2.3 From e0685fa228fdaf386f82ac0d64b2d6f3e0ddd588 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:23 +0100 Subject: arm64: Retrieve stolen time as paravirtualized guest Enable paravirtualization features when running under a hypervisor supporting the PV_TIME_ST hypercall. For each (v)CPU, we ask the hypervisor for the location of a shared page which the hypervisor will use to report stolen time to us. We set pv_time_ops to the stolen time function which simply reads the stolen value from the shared page for a VCPU. We guarantee single-copy atomicity using READ_ONCE which means we can also read the stolen time for another VCPU than the currently running one while it is potentially being updated by the hypervisor. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 068793a619ca..89d75edb5750 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -136,6 +136,7 @@ enum cpuhp_state { /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, + CPUHP_AP_ARM_KVMPV_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, -- cgit v1.2.3 From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other thread's signal handler. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. Adding this flags lets glibc avoid these problems. [1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- include/uapi/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 99335e1f4a27..1d500ed03c63 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,9 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +/* Flags for the clone3() syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall -- cgit v1.2.3 From 1565bdad59e97f31cfc7b065bc0fc77e9549e62d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 9 Oct 2019 16:34:17 -0700 Subject: fscrypt: remove struct fscrypt_ctx Now that ext4 and f2fs implement their own post-read workflow that supports both fscrypt and fsverity, the fscrypt-only workflow based around struct fscrypt_ctx is no longer used. So remove the unused code. This is based on a patch from Chandan Rajendra's "Consolidate FS read I/O callbacks code" patchset, but rebased onto the latest kernel, folded __fscrypt_decrypt_bio() into fscrypt_decrypt_bio(), cleaned up fscrypt_initialize(), and updated the commit message. Originally-from: Chandan Rajendra Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f622f7460ed8..04f5ed628445 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -20,7 +20,6 @@ #define FS_CRYPTO_BLOCK_SIZE 16 -struct fscrypt_ctx; struct fscrypt_info; struct fscrypt_str { @@ -64,18 +63,6 @@ struct fscrypt_operations { unsigned int max_namelen; }; -/* Decryption work */ -struct fscrypt_ctx { - union { - struct { - struct bio *bio; - struct work_struct work; - }; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ -}; - static inline bool fscrypt_has_encryption_key(const struct inode *inode) { /* pairs with cmpxchg_release() in fscrypt_get_encryption_info() */ @@ -102,8 +89,6 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) /* crypto.c */ extern void fscrypt_enqueue_decrypt_work(struct work_struct *); -extern struct fscrypt_ctx *fscrypt_get_ctx(gfp_t); -extern void fscrypt_release_ctx(struct fscrypt_ctx *); extern struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, @@ -244,8 +229,6 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname, /* bio.c */ extern void fscrypt_decrypt_bio(struct bio *); -extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, - struct bio *bio); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); @@ -295,16 +278,6 @@ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } -static inline struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx) -{ - return; -} - static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, @@ -484,11 +457,6 @@ static inline void fscrypt_decrypt_bio(struct bio *bio) { } -static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, - struct bio *bio) -{ -} - static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { -- cgit v1.2.3 From 78958563d8023db0c6d03a2fe2a64d79b47b4349 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 21 Oct 2019 16:30:22 -0700 Subject: KVM: x86: Remove unneeded kvm_vcpu variable, guest_xcr0_loaded The kvm_vcpu variable, guest_xcr0_loaded, is a waste of an 'int' and a conditional branch. VMX and SVM are the only users, and both unconditionally pair kvm_load_guest_xcr0() with kvm_put_guest_xcr0() making this check unnecessary. Without this variable, the predicates in kvm_load_guest_xcr0 and kvm_put_guest_xcr0 should match. Suggested-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Aaron Lewis Change-Id: I7b1eb9b62969d7bbb2850f27e42f863421641b23 Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 719fc3e15ea4..d2017302996c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -278,7 +278,6 @@ struct kvm_vcpu { struct mutex mutex; struct kvm_run *run; - int guest_xcr0_loaded; struct swait_queue_head wq; struct pid __rcu *pid; int sigset_active; -- cgit v1.2.3 From 149487bdacde32f5a9a344a49533ae0772fb9db7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 21 Oct 2019 15:58:42 -0700 Subject: KVM: Add separate helper for putting borrowed reference to kvm Add a new helper, kvm_put_kvm_no_destroy(), to handle putting a borrowed reference[*] to the VM when installing a new file descriptor fails. KVM expects the refcount to remain valid in this case, as the in-progress ioctl() has an explicit reference to the VM. The primary motiviation for the helper is to document that the 'kvm' pointer is still valid after putting the borrowed reference, e.g. to document that doing mutex(&kvm->lock) immediately after putting a ref to kvm isn't broken. [*] When exposing a new object to userspace via a file descriptor, e.g. a new vcpu, KVM grabs a reference to itself (the VM) prior to making the object visible to userspace to avoid prematurely freeing the VM in the scenario where userspace immediately closes file descriptor. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d2017302996c..a817e446c9aa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -621,6 +621,7 @@ void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); +void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { -- cgit v1.2.3 From 68bb8ea8ad0d497c28ed47423246b1ab20f26976 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:15 -0400 Subject: net: dsa: use dsa_to_port helper everywhere Do not let the drivers access the ds->ports static array directly while there is a dsa_to_port helper for this purpose. At the same time, un-const this helper since the SJA1105 driver assigns the priv member of the returned dsa_port structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8c3ea0530f65..2e4fe2f8962b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -278,7 +278,7 @@ struct dsa_switch { struct dsa_port ports[]; }; -static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) +static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { return &ds->ports[p]; } -- cgit v1.2.3 From ab8ccae122a41530a89bc899ace0e46defb156a8 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:16 -0400 Subject: net: dsa: add ports list in the switch fabric Add a list of switch ports within the switch fabric. This will help the lookup of a port inside the whole fabric, and it is the first step towards supporting multiple CPU ports, before deprecating the usage of the unique dst->cpu_dp pointer. In preparation for a future allocation of the dsa_port structures, return -ENOMEM in case no structure is returned, even though this error cannot be reached yet. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2e4fe2f8962b..6ff6dfcdc61d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -125,6 +125,9 @@ struct dsa_switch_tree { */ struct dsa_port *cpu_dp; + /* List of switch ports */ + struct list_head ports; + /* * Data for the individual switch chips. */ @@ -195,6 +198,8 @@ struct dsa_port { struct work_struct xmit_work; struct sk_buff_head xmit_queue; + struct list_head list; + /* * Give the switch driver somewhere to hang its per-port private data * structures (accessible from the tagger). -- cgit v1.2.3 From b96ddf254b09447c6b79632cdc02dae3f2454a82 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:17 -0400 Subject: net: dsa: use ports list in dsa_to_port Use the new ports list instead of accessing the dsa_switch array of ports in the dsa_to_port helper. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6ff6dfcdc61d..d2b7ee28f3fd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -285,7 +285,14 @@ struct dsa_switch { static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { - return &ds->ports[p]; + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp = NULL; + + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds == ds && dp->index == p) + break; + + return dp; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From fb35c60cbacc67a6075fb8e3d98fa348665662fe Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:19 -0400 Subject: net: dsa: use ports list to setup switches Use the new ports list instead of iterating over switches and their ports when setting up the switches and their ports. At the same time, provide setup states and messages for ports and switches as it is done for the trees. Signed-off-by: Vivien Didelot Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d2b7ee28f3fd..bd08bdee8341 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -215,9 +215,13 @@ struct dsa_port { * Original copy of the master netdev net_device_ops */ const struct net_device_ops *orig_ndo_ops; + + bool setup; }; struct dsa_switch { + bool setup; + struct device *dev; /* -- cgit v1.2.3 From da4561cda2ea6240fc61442eeb2acc47e2e0cae3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:24 -0400 Subject: net: dsa: use ports list to setup default CPU port Use the new ports list instead of iterating over switches and their ports when setting up the default CPU port. Unassign it on teardown. Now that we can iterate over multiple CPU ports, remove dst->cpu_dp. At the same time, provide a better error message for CPU-less tree. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index bd08bdee8341..f572134eb5de 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -120,11 +120,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* - * The switch port to which the CPU is attached. - */ - struct dsa_port *cpu_dp; - /* List of switch ports */ struct list_head ports; -- cgit v1.2.3 From 05f294a852358a46d9236cc777901f49a4f0ae85 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:29 -0400 Subject: net: dsa: allocate ports on touch Allocate the struct dsa_port the first time it is accessed with dsa_port_touch, and remove the static dsa_port array from the dsa_switch structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index f572134eb5de..9bc1d3f71f89 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -277,9 +277,7 @@ struct dsa_switch { */ bool vlan_filtering; - /* Dynamically allocated ports, keep last */ size_t num_ports; - struct dsa_port ports[]; }; static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 7e99e34701728d54ccd0466eccf377a42b9db215 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 21 Oct 2019 16:51:30 -0400 Subject: net: dsa: remove dsa_switch_alloc helper Now that ports are dynamically listed in the fabric, there is no need to provide a special helper to allocate the dsa_switch structure. This will give more flexibility to drivers to embed this structure as they wish in their private structure. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9bc1d3f71f89..e3c14dc3bab9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -577,7 +577,6 @@ static inline bool dsa_can_decode(const struct sk_buff *skb, return false; } -struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 5e5b03d163e15a40b0fa57c70b4e8edd549b0b98 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Tue, 22 Oct 2019 13:59:25 +0100 Subject: xdp: Fix type of string pointer in __XDP_ACT_SYM_TAB The table entry in __XDP_ACT_SYM_TAB for the last item is set to { -1, 0 } where it should be { -1, NULL } as the second item is a pointer to a string. Fixes the following sparse warnings: ./include/trace/events/xdp.h:28:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:53:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:82:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:140:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:155:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:190:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:225:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:260:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:318:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:356:1: warning: Using plain integer as NULL pointer ./include/trace/events/xdp.h:390:1: warning: Using plain integer as NULL pointer Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191022125925.10508-1-ben.dooks@codethink.co.uk --- include/trace/events/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 8c8420230a10..c7e3c9c5bad3 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -22,7 +22,7 @@ #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ - __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 } + __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, -- cgit v1.2.3 From cbb79863fc3175ed5ac506465948b02a893a8235 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 14 Oct 2019 10:35:56 -0500 Subject: ipmi: Don't allow device module unload when in use If something has the IPMI driver open, don't allow the device module to be unloaded. Before it would unload and the user would get errors on use. This change is made on user request, and it makes it consistent with the I2C driver, which has the same behavior. It does change things a little bit with respect to kernel users. If the ACPI or IPMI watchdog (or any other kernel user) has created a user, then the device module cannot be unloaded. Before it could be unloaded, This does not affect hot-plug. If the device goes away (it's on something removable that is removed or is hot-removed via sysfs) then it still behaves as it did before. Reported-by: tony camuso Signed-off-by: Corey Minyard Tested-by: tony camuso --- include/linux/ipmi_smi.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 4dc66157d872..deec18b8944a 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -224,10 +224,14 @@ static inline int ipmi_demangle_device_id(uint8_t netfn, uint8_t cmd, * is called, and the lower layer must get the interface from that * call. */ -int ipmi_register_smi(const struct ipmi_smi_handlers *handlers, - void *send_info, - struct device *dev, - unsigned char slave_addr); +int ipmi_add_smi(struct module *owner, + const struct ipmi_smi_handlers *handlers, + void *send_info, + struct device *dev, + unsigned char slave_addr); + +#define ipmi_register_smi(handlers, send_info, dev, slave_addr) \ + ipmi_add_smi(THIS_MODULE, handlers, send_info, dev, slave_addr) /* * Remove a low-level interface from the IPMI driver. This will -- cgit v1.2.3 From 4b97ba73dcdc24fd968cbeb970ae57212e2c1c73 Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Wed, 4 Sep 2019 01:15:24 +0000 Subject: mtd: spi-nor: intel-spi: add support for Intel Cannon Lake SPI flash Now that SPI flash controllers without a software sequencer are supported, it's trivial to add support for CNL and its PCI ID. Values from https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/300-series-chipset-pch-datasheet-vol-2.pdf Signed-off-by: Jethro Beekman Reviewed-by: Mika Westerberg Signed-off-by: Tudor Ambarus --- include/linux/platform_data/intel-spi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h index ebb4f332588b..7f53a5c6f35e 100644 --- a/include/linux/platform_data/intel-spi.h +++ b/include/linux/platform_data/intel-spi.h @@ -13,6 +13,7 @@ enum intel_spi_type { INTEL_SPI_BYT = 1, INTEL_SPI_LPT, INTEL_SPI_BXT, + INTEL_SPI_CNL, }; /** -- cgit v1.2.3 From 45397787536434648495f7b02a7e669ab8ae12f3 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 24 Sep 2019 07:45:53 +0000 Subject: mtd: spi-nor: Introduce 'struct spi_nor_controller_ops' Move all SPI NOR controller driver specific ops in a dedicated structure. 'struct spi_nor' becomes lighter. Use size_t for lengths in 'int (*write_reg)()' and 'int (*read_reg)()'. Rename wite/read_buf to buf, the name of the functions are suggestive enough. Constify buf in int (*write_reg). Comply with these changes in the SPI NOR controller drivers. Suggested-by: Boris Brezillon Signed-off-by: Tudor Ambarus Reviewed-by: Boris Brezillon --- include/linux/mtd/spi-nor.h | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index fc0b4b19c900..d1d736d3c8ab 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -465,6 +465,34 @@ enum spi_nor_pp_command_index { /* Forward declaration that will be used in 'struct spi_nor_flash_parameter' */ struct spi_nor; +/** + * struct spi_nor_controller_ops - SPI NOR controller driver specific + * operations. + * @prepare: [OPTIONAL] do some preparations for the + * read/write/erase/lock/unlock operations. + * @unprepare: [OPTIONAL] do some post work after the + * read/write/erase/lock/unlock operations. + * @read_reg: read out the register. + * @write_reg: write data to the register. + * @read: read data from the SPI NOR. + * @write: write data to the SPI NOR. + * @erase: erase a sector of the SPI NOR at the offset @offs; if + * not provided by the driver, spi-nor will send the erase + * opcode via write_reg(). + */ +struct spi_nor_controller_ops { + int (*prepare)(struct spi_nor *nor, enum spi_nor_ops ops); + void (*unprepare)(struct spi_nor *nor, enum spi_nor_ops ops); + int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, size_t len); + int (*write_reg)(struct spi_nor *nor, u8 opcode, const u8 *buf, + size_t len); + + ssize_t (*read)(struct spi_nor *nor, loff_t from, size_t len, u8 *buf); + ssize_t (*write)(struct spi_nor *nor, loff_t to, size_t len, + const u8 *buf); + int (*erase)(struct spi_nor *nor, loff_t offs); +}; + /** * struct spi_nor_locking_ops - SPI NOR locking methods * @lock: lock a region of the SPI NOR. @@ -549,17 +577,7 @@ struct flash_info; * @read_proto: the SPI protocol for read operations * @write_proto: the SPI protocol for write operations * @reg_proto the SPI protocol for read_reg/write_reg/erase operations - * @prepare: [OPTIONAL] do some preparations for the - * read/write/erase/lock/unlock operations - * @unprepare: [OPTIONAL] do some post work after the - * read/write/erase/lock/unlock operations - * @read_reg: [DRIVER-SPECIFIC] read out the register - * @write_reg: [DRIVER-SPECIFIC] write data to the register - * @read: [DRIVER-SPECIFIC] read data from the SPI NOR - * @write: [DRIVER-SPECIFIC] write data to the SPI NOR - * @erase: [DRIVER-SPECIFIC] erase a sector of the SPI NOR - * at the offset @offs; if not provided by the driver, - * spi-nor will send the erase opcode via write_reg() + * @controller_ops: SPI NOR controller driver specific operations. * @clear_sr_bp: [FLASH-SPECIFIC] clears the Block Protection Bits from * the SPI NOR Status Register. * @params: [FLASH-SPECIFIC] SPI-NOR flash parameters and settings. @@ -588,16 +606,7 @@ struct spi_nor { bool sst_write_second; u32 flags; - int (*prepare)(struct spi_nor *nor, enum spi_nor_ops ops); - void (*unprepare)(struct spi_nor *nor, enum spi_nor_ops ops); - int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); - int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); - - ssize_t (*read)(struct spi_nor *nor, loff_t from, - size_t len, u_char *read_buf); - ssize_t (*write)(struct spi_nor *nor, loff_t to, - size_t len, const u_char *write_buf); - int (*erase)(struct spi_nor *nor, loff_t offs); + const struct spi_nor_controller_ops *controller_ops; int (*clear_sr_bp)(struct spi_nor *nor); struct spi_nor_flash_parameter params; -- cgit v1.2.3 From 71a8a63b9dbdeba8205a37979b81d4fba499d079 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:23:55 +0200 Subject: netfilter: nf_flow_table: move priority to struct nf_flowtable Hardware offload needs access to the priority field, store this field in the nf_flowtable object. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + include/net/netfilter/nf_tables.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b37a7d608134..158514281a75 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -24,6 +24,7 @@ struct nf_flowtable_type { struct nf_flowtable { struct list_head list; struct rhashtable rhashtable; + int priority; const struct nf_flowtable_type *type; struct delayed_work gc_work; }; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 001d294edf57..d529dfb5aa64 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1155,7 +1155,6 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number - * @priority: hook priority * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table @@ -1169,7 +1168,6 @@ struct nft_flowtable { struct nft_table *table; char *name; int hooknum; - int priority; int ops_len; u32 genmask:2, use:30; -- cgit v1.2.3 From 3f0465a9ef02624e0a36db9e7c9bedcafcd6f6fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:24:01 +0200 Subject: netfilter: nf_tables: dynamically allocate hooks per net_device in flowtables Use a list of hooks per device instead an array. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d529dfb5aa64..7a2ac82ee0ad 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -963,6 +963,12 @@ struct nft_stats { struct u64_stats_sync syncp; }; +struct nft_hook { + struct list_head list; + struct nf_hook_ops ops; + struct rcu_head rcu; +}; + /** * struct nft_base_chain - nf_tables base chain * @@ -1173,7 +1179,7 @@ struct nft_flowtable { use:30; u64 handle; /* runtime data below here */ - struct nf_hook_ops *ops ____cacheline_aligned; + struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; -- cgit v1.2.3 From cb662ac6711f7135618526221498ebfae155531a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:29:47 +0200 Subject: netfilter: nf_tables: increase maximum devices number per flowtable Rise the maximum limit of devices per flowtable up to 256. Rename NFT_FLOWTABLE_DEVICE_MAX to NFT_NETDEVICE_MAX in preparation to reuse the netdev hook parser for ingress basechain. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7a2ac82ee0ad..3d71070e747a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1152,7 +1152,7 @@ struct nft_object_ops { int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); -#define NFT_FLOWTABLE_DEVICE_MAX 8 +#define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table -- cgit v1.2.3 From d54725cd11a57c30f650260cfb0a92c268bdc3e0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:30:05 +0200 Subject: netfilter: nf_tables: support for multiple devices per netdev hook This patch allows you to register one netdev basechain to multiple devices. This adds a new NFTA_HOOK_DEVS netlink attribute to specify the list of netdevices. Basechains store a list of hooks. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3d71070e747a..5bf569e1173b 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -973,21 +973,21 @@ struct nft_hook { * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops + * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @stats: per-cpu chain stats * @chain: the chain - * @dev_name: device name that this base chain is attached to (if any) * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; + struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; - char dev_name[IFNAMSIZ]; struct flow_block flow_block; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ed8881ad18ed..81fed16fe2b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -144,12 +144,14 @@ enum nft_list_attributes { * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED) */ enum nft_hook_attributes { NFTA_HOOK_UNSPEC, NFTA_HOOK_HOOKNUM, NFTA_HOOK_PRIORITY, NFTA_HOOK_DEV, + NFTA_HOOK_DEVS, __NFTA_HOOK_MAX }; #define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) -- cgit v1.2.3 From a3470c1829c0c856a19c10af58f8e7792ae27d7a Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Wed, 23 Oct 2019 10:00:46 +0300 Subject: spi: document CS setup, hold & inactive times in header This change documents the CS setup, host & inactive times. They were omitted when the fields were added, and were caught by one of the build bots. Fixes: 25093bdeb6bc ("spi: implement SW control for CS times") Reported-by: kbuild test robot Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20191023070046.12478-1-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index c40d6af2bf07..98fe8663033a 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -407,6 +407,11 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * controller has native support for memory like operations. * @unprepare_message: undo any work done by prepare_message(). * @slave_abort: abort the ongoing transfer request on an SPI slave controller + * @cs_setup: delay to be introduced by the controller after CS is asserted + * @cs_hold: delay to be introduced by the controller before CS is deasserted + * @cs_inactive: delay to be introduced by the controller after CS is + * deasserted. If @cs_change_delay is used from @spi_transfer, then the + * two delays will be added up. * @cs_gpios: LEGACY: array of GPIO descs to use as chip select lines; one per * CS number. Any individual value may be -ENOENT for CS lines that * are not GPIOs (driven by the SPI controller itself). Use the cs_gpiods -- cgit v1.2.3 From fa6e98cee558622565c97924e922b97340aeabd8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 22 Oct 2019 11:31:07 -0700 Subject: net: phy: add support for clause 37 auto-negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for clause 37 1000Base-X auto-negotiation. Signed-off-by: Heiner Kallweit Signed-off-by: Tao Ren Tested-by: René van Dorst Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a0e981df502..78436d58ce7c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1106,6 +1106,10 @@ int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); +/* Clause 37 */ +int genphy_c37_config_aneg(struct phy_device *phydev); +int genphy_c37_read_status(struct phy_device *phydev); + /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); -- cgit v1.2.3 From b9bcb95315febd09419ab870ddc7cb98a393f9d0 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Tue, 22 Oct 2019 11:31:08 -0700 Subject: net: phy: broadcom: add 1000Base-X support for BCM54616S The BCM54616S PHY cannot work properly in RGMII->1000Base-X mode, mainly because genphy functions are designed for copper links, and 1000Base-X (clause 37) auto negotiation needs to be handled differently. This patch enables 1000Base-X support for BCM54616S by customizing 3 driver callbacks, and it's verified to be working on Facebook CMM BMC platform (RGMII->1000Base-KX): - probe: probe callback detects PHY's operation mode based on INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX Control register. - config_aneg: calls genphy_c37_config_aneg when the PHY is running in 1000Base-X mode; otherwise, genphy_config_aneg will be called. - read_status: calls genphy_c37_read_status when the PHY is running in 1000Base-X mode; otherwise, genphy_read_status will be called. Note: BCM54616S PHY can also be configured in RGMII->100Base-FX mode, and 100Base-FX support is not available as of now. Signed-off-by: Tao Ren Acked-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6db2d9a6e503..b475e7f20d28 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -200,9 +200,15 @@ #define BCM5482_SHD_SSD 0x14 /* 10100: Secondary SerDes control */ #define BCM5482_SHD_SSD_LEDM 0x0008 /* SSD LED Mode enable */ #define BCM5482_SHD_SSD_EN 0x0001 /* SSD enable */ -#define BCM5482_SHD_MODE 0x1f /* 11111: Mode Control Register */ -#define BCM5482_SHD_MODE_1000BX 0x0001 /* Enable 1000BASE-X registers */ +/* 10011: SerDes 100-FX Control Register */ +#define BCM54616S_SHD_100FX_CTRL 0x13 +#define BCM54616S_100FX_MODE BIT(0) /* 100-FX SerDes Enable */ + +/* 11111: Mode Control Register */ +#define BCM54XX_SHD_MODE 0x1f +#define BCM54XX_SHD_INTF_SEL_MASK GENMASK(2, 1) /* INTERF_SEL[1:0] */ +#define BCM54XX_SHD_MODE_1000BX BIT(0) /* Enable 1000-X registers */ /* * EXPANSION SHADOW ACCESS REGISTERS. (PHY REG 0x15, 0x16, and 0x17) -- cgit v1.2.3 From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Oct 2019 17:18:11 -0700 Subject: bpf: Prepare btf_ctx_access for non raw_tp use case This patch makes a few changes to btf_ctx_access() to prepare it for non raw_tp use case where the attach_btf_id is not necessary a BTF_KIND_TYPEDEF. It moves the "btf_trace_" prefix check and typedef-follow logic to a new function "check_attach_btf_id()" which is called only once during bpf_check(). btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO type now. That should also be more efficient since it is done only one instead of every-time check_ctx_access() is called. "check_attach_btf_id()" needs to find the func_proto type from the attach_btf_id. It needs to store the result into the newly added prog->aux->attach_func_proto. func_proto btf type has no name, so a proper name should be stored into "attach_func_name" also. v2: - Move the "btf_trace_" check to an earlier verifier phase (Alexei) Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com --- include/linux/bpf.h | 5 +++++ include/linux/btf.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c2c29b49845..171be30fe0ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -392,6 +392,11 @@ struct bpf_prog_aux { u32 attach_btf_id; /* in-kernel BTF type id to attach to */ bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; + bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ + const struct btf_type *attach_func_proto; + /* function name for valid attach_btf_id */ + const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; diff --git a/include/linux/btf.h b/include/linux/btf.h index 55d43bc856be..9dee00859c5f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -5,6 +5,7 @@ #define _LINUX_BTF_H 1 #include +#include struct btf; struct btf_member; @@ -53,6 +54,36 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); +static inline bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static inline bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +static inline bool btf_type_is_enum(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; +} + +static inline bool btf_type_is_typedef(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_type_is_func(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +} + +static inline bool btf_type_is_func_proto(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); -- cgit v1.2.3 From 5153faac18d293fc7abb19ff7034683fbcd82dc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2019 12:03:51 -0700 Subject: cgroup: remove cgroup_enable_task_cg_lists() optimization cgroup_enable_task_cg_lists() is used to lazyily initialize task cgroup associations on the first use to reduce fork / exit overheads on systems which don't use cgroup. Unfortunately, locking around it has never been actually correct and its value is dubious given how the vast majority of systems use cgroup right away from boot. This patch removes the optimization. For now, replace the cg_list based branches with WARN_ON_ONCE()'s to be on the safe side. We can simplify the logic further in the future. Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3ba3e6da13a6..f6b048902d6c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,7 +150,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void cgroup_enable_task_cg_lists(void); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); -- cgit v1.2.3 From 1d55fdc85799372ab3b0d2a6928e73439f8149aa Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 17 Oct 2019 22:35:11 +0000 Subject: crypto: ccp - Retry SEV INIT command in case of integrity check failure. SEV INIT command loads the SEV related persistent data from NVS and initializes the platform context. The firmware validates the persistent state. If validation fails, the firmware will reset the persisent state and return an integrity check failure status. At this point, a subsequent INIT command should succeed, so retry the command. The INIT command retry is only done during driver initialization. Additional enums along with SEV_RET_SECURE_DATA_INVALID are added to sev_ret_code to maintain continuity and relevance of enum values. Signed-off-by: Ashish Kalra Acked-by: David Rientjes Reviewed-by: Brijesh Singh Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 592a0c1b77c9..0549a5c622bf 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -58,6 +58,9 @@ typedef enum { SEV_RET_HWSEV_RET_PLATFORM, SEV_RET_HWSEV_RET_UNSAFE, SEV_RET_UNSUPPORTED, + SEV_RET_INVALID_PARAM, + SEV_RET_RESOURCE_LIMIT, + SEV_RET_SECURE_DATA_INVALID, SEV_RET_MAX, } sev_ret_code; -- cgit v1.2.3 From 993e4cdebb5a53bc87f21cdd34d1dc42225de43d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Oct 2019 19:31:10 +0200 Subject: block: reorder bio::__bi_remaining for better packing Simple reordering of __bi_remaining can reduce bio size by 8 bytes that are now wasted on padding (measured on x86_64): struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ /* XXX 4 bytes hole, try to pack */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ atomic_t __bi_remaining; /* 56 4 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 104, cachelines: 2, members: 19 */ /* sum members: 96, holes: 2, sum holes: 8 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 40 bytes */ }; Now becomes: struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ atomic_t __bi_remaining; /* 28 4 */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ [...] /* size: 96, cachelines: 2, members: 19 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 32 bytes */ }; Signed-off-by: David Sterba Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d688b96d1d63..1e7eeec16458 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,10 +153,10 @@ struct bio { unsigned short bi_write_hint; blk_status_t bi_status; u8 bi_partno; + atomic_t __bi_remaining; struct bvec_iter bi_iter; - atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3 From d386732bc142c63b9f676fed098bc06f91ee964a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2019 21:07:24 -0300 Subject: blk-mq: fill header with kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Insert documentation for structs, enums and functions at header file. Format existing and new comments at struct blk_mq_ops as kernel-doc comments. Reviewed-by: Bart Van Assche Signed-off-by: André Almeida Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 225 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 185 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e0fce93ac127..4919d22e1aff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -10,74 +10,171 @@ struct blk_mq_tags; struct blk_flush_queue; /** - * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device + * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware + * block device */ struct blk_mq_hw_ctx { struct { + /** @lock: Protects the dispatch list. */ spinlock_t lock; + /** + * @dispatch: Used for requests that are ready to be + * dispatched to the hardware but for some reason (e.g. lack of + * resources) could not be sent to the hardware. As soon as the + * driver can send new requests, requests at this list will + * be sent first for a fairer dispatch. + */ struct list_head dispatch; - unsigned long state; /* BLK_MQ_S_* flags */ + /** + * @state: BLK_MQ_S_* flags. Defines the state of the hw + * queue (active, scheduled to restart, stopped). + */ + unsigned long state; } ____cacheline_aligned_in_smp; + /** + * @run_work: Used for scheduling a hardware queue run at a later time. + */ struct delayed_work run_work; + /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; + /** + * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU + * selection from @cpumask. + */ int next_cpu; + /** + * @next_cpu_batch: Counter of how many works left in the batch before + * changing to the next CPU. + */ int next_cpu_batch; - unsigned long flags; /* BLK_MQ_F_* flags */ + /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ + unsigned long flags; + /** + * @sched_data: Pointer owned by the IO scheduler attached to a request + * queue. It's up to the IO scheduler how to use this pointer. + */ void *sched_data; + /** + * @queue: Pointer to the request queue that owns this hardware context. + */ struct request_queue *queue; + /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; + /** + * @driver_data: Pointer to data owned by the block driver that created + * this hctx + */ void *driver_data; + /** + * @ctx_map: Bitmap for each software queue. If bit is on, there is a + * pending request in that software queue. + */ struct sbitmap ctx_map; + /** + * @dispatch_from: Software queue to be used when no scheduler was + * selected. + */ struct blk_mq_ctx *dispatch_from; + /** + * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to + * decide if the hw_queue is busy using Exponential Weighted Moving + * Average algorithm. + */ unsigned int dispatch_busy; + /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; + /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; + /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; + /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; + /** + * @dispatch_wait: Waitqueue to put requests when there is no tag + * available at the moment, to wait for another try in the future. + */ wait_queue_entry_t dispatch_wait; + + /** + * @wait_index: Index of next available dispatch_wait queue to insert + * requests. + */ atomic_t wait_index; + /** + * @tags: Tags owned by the block driver. A tag at this set is only + * assigned when a request is dispatched from a hardware queue. + */ struct blk_mq_tags *tags; + /** + * @sched_tags: Tags owned by I/O scheduler. If there is an I/O + * scheduler associated with a request queue, a tag is assigned when + * that request is allocated. Else, this member is not used. + */ struct blk_mq_tags *sched_tags; + /** @queued: Number of queued requests. */ unsigned long queued; + /** @run: Number of dispatched requests. */ unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 + /** @dispatched: Number of dispatch requests by queue. */ unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; + /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; + /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; + /** + * @nr_active: Number of active requests. Only used when a tag set is + * shared across request queues. + */ atomic_t nr_active; + /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; + /** @kobj: Kernel object for sysfs. */ struct kobject kobj; + /** @poll_considered: Count times blk_poll() was called. */ unsigned long poll_considered; + /** @poll_invoked: Count how many requests blk_poll() polled. */ unsigned long poll_invoked; + /** @poll_success: Count how many polled requests were completed. */ unsigned long poll_success; #ifdef CONFIG_BLK_DEBUG_FS + /** + * @debugfs_dir: debugfs directory for this hardware queue. Named + * as cpu. + */ struct dentry *debugfs_dir; + /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif + /** @hctx_list: List of all hardware queues. */ struct list_head hctx_list; - /* Must be the last member - see also blk_mq_hw_ctx_size(). */ + /** + * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is + * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also + * blk_mq_hw_ctx_size(). + */ struct srcu_struct srcu[0]; }; /** - * struct blk_mq_queue_map - ctx -> hctx mapping + * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). @@ -92,10 +189,17 @@ struct blk_mq_queue_map { unsigned int queue_offset; }; +/** + * enum hctx_type - Type of hardware queue + * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. + * @HCTX_TYPE_READ: Just for READ I/O. + * @HCTX_TYPE_POLL: Polled I/O of any kind. + * @HCTX_MAX_TYPES: Number of types of hctx. + */ enum hctx_type { - HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */ - HCTX_TYPE_READ, /* just for READ I/O */ - HCTX_TYPE_POLL, /* polled I/O of any kind */ + HCTX_TYPE_DEFAULT, + HCTX_TYPE_READ, + HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; @@ -147,6 +251,12 @@ struct blk_mq_tag_set { struct list_head tag_list; }; +/** + * struct blk_mq_queue_data - Data about a request inserted in a queue + * + * @rq: Request pointer. + * @last: If it is the last request in the queue. + */ struct blk_mq_queue_data { struct request *rq; bool last; @@ -174,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *); typedef void (complete_fn)(struct request *); typedef void (cleanup_rq_fn)(struct request *); - +/** + * struct blk_mq_ops - Callback functions that implements block driver + * behaviour. + */ struct blk_mq_ops { - /* - * Queue request + /** + * @queue_rq: Queue a new request from block IO. */ queue_rq_fn *queue_rq; - /* - * If a driver uses bd->last to judge when to submit requests to - * hardware, it must define this function. In case of errors that - * make us stop issuing further requests, this hook serves the + /** + * @commit_rqs: If a driver uses bd->last to judge when to submit + * requests to hardware, it must define this function. In case of errors + * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ commit_rqs_fn *commit_rqs; - /* - * Reserve budget before queue request, once .queue_rq is + /** + * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ get_budget_fn *get_budget; + /** + * @put_budget: Release the reserved budget. + */ put_budget_fn *put_budget; - /* - * Called on request timeout + /** + * @timeout: Called on request timeout. */ timeout_fn *timeout; - /* - * Called to poll for completion of a specific tag. + /** + * @poll: Called to poll for completion of a specific tag. */ poll_fn *poll; + /** + * @complete: Mark the request as complete. + */ complete_fn *complete; - /* - * Called when the block layer side of a hardware queue has been - * set up, allowing the driver to allocate/init matching structures. - * Ditto for exit/teardown. + /** + * @init_hctx: Called when the block layer side of a hardware queue has + * been set up, allowing the driver to allocate/init matching + * structures. */ init_hctx_fn *init_hctx; + /** + * @exit_hctx: Ditto for exit/teardown. + */ exit_hctx_fn *exit_hctx; - /* - * Called for every command allocated by the block layer to allow - * the driver to set up driver specific data. + /** + * @init_request: Called for every command allocated by the block layer + * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. - * - * Ditto for exit/teardown. */ init_request_fn *init_request; + /** + * @exit_request: Ditto for exit/teardown. + */ exit_request_fn *exit_request; - /* Called from inside blk_get_request() */ + + /** + * @initialize_rq_fn: Called from inside blk_get_request(). + */ void (*initialize_rq_fn)(struct request *rq); - /* - * Called before freeing one request which isn't completed yet, - * and usually for freeing the driver private data + /** + * @cleanup_rq: Called before freeing one request which isn't completed + * yet, and usually for freeing the driver private data. */ cleanup_rq_fn *cleanup_rq; - /* - * If set, returns whether or not this queue currently is busy + /** + * @busy: If set, returns whether or not this queue currently is busy. */ busy_fn *busy; + /** + * @map_queues: This allows drivers specify their own queue mapping by + * overriding the setup-time function that builds the mq_map. + */ map_queues_fn *map_queues; #ifdef CONFIG_BLK_DEBUG_FS - /* - * Used by the debugfs implementation to show driver-specific + /** + * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); @@ -391,14 +521,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); -/* +/** + * blk_mq_rq_from_pdu - cast a PDU to a request + * @pdu: the PDU (Protocol Data Unit) to be casted + * + * Return: request + * * Driver command data is immediately after the request. So subtract request - * size to get back to the original request, add request size to get the PDU. + * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } + +/** + * blk_mq_rq_to_pdu - cast a request to a PDU + * @rq: the request to be casted + * + * Return: pointer to the PDU + * + * Driver command data is immediately after the request. So add request to get + * the PDU. + */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; -- cgit v1.2.3 From 9a7f12edf8848a9b355a86fc0183d7be932ef11e Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:21 +0200 Subject: fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name According to commit message in the original commit c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints"), as well as userspace library[1] and man page update[2], R/W hint constants are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained "RWF_*" prefix used in the early versions of the proposed patch set[3]. Rename it and provide the old name as a synonym for the new one for backward compatibility. [1] https://github.com/axboe/fio/commit/bd553af6c849 [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints") Acked-by: Song Liu Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe --- include/uapi/linux/fcntl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1d338357df8a..1f97b33c840e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -58,13 +58,20 @@ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. */ -#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NOT_SET 0 #define RWH_WRITE_LIFE_NONE 1 #define RWH_WRITE_LIFE_SHORT 2 #define RWH_WRITE_LIFE_MEDIUM 3 #define RWH_WRITE_LIFE_LONG 4 #define RWH_WRITE_LIFE_EXTREME 5 +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From 480274787d7e3458bc5a7cfbbbe07033984ad711 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 23 Oct 2019 11:09:26 -0400 Subject: tcp: add TCP_INFO status for failed client TFO The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether or not data-in-SYN was ack'd on both the client and server side. We'd like to gather more information on the client-side in the failure case in order to indicate the reason for the failure. This can be useful for not only debugging TFO, but also for creating TFO socket policies. For example, if a middle box removes the TFO option or drops a data-in-SYN, we can can detect this case, and turn off TFO for these connections saving the extra retransmits. The newly added tcpi_fastopen_client_fail status is 2 bits and has the following 4 states: 1) TFO_STATUS_UNSPEC Catch-all state which includes when TFO is disabled via black hole detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE. 2) TFO_COOKIE_UNAVAILABLE If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie is available in the cache. 3) TFO_DATA_NOT_ACKED Data was sent with SYN, we received a SYN/ACK but it did not cover the data portion. Cookie is not accepted by server because the cookie may be invalid or the server may be overloaded. 4) TFO_SYN_RETRANSMITTED Data was sent with SYN, we received a SYN/ACK which did not cover the data after at least 1 additional SYN was sent (without data). It may be the case that a middle-box is dropping data-in-SYN packets. Thus, it would be more efficient to not use TFO on this connection to avoid extra retransmits during connection establishment. These new fields do not cover all the cases where TFO may fail, but other failures, such as SYN/ACK + data being dropped, will result in the connection not becoming established. And a connection blackhole after session establishment shows up as a stalled connection. Signed-off-by: Jason Baron Cc: Eric Dumazet Cc: Neal Cardwell Cc: Christoph Paasch Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- include/uapi/linux/tcp.h | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 668e25a76d69..ca6f01531e64 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -223,7 +223,7 @@ struct tcp_sock { fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - unused:2; + fastopen_client_fail:2; /* reason why fastopen failed */ u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 81e697978e8b..74af1f759cee 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -155,6 +155,14 @@ enum { TCP_QUEUES_NR, }; +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 @@ -211,7 +219,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; - __u8 tcpi_delivery_rate_app_limited:1; + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; __u32 tcpi_rto; __u32 tcpi_ato; -- cgit v1.2.3 From ae4a50ee3151d6cb11c56297699ca9025eb18077 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 25 Oct 2019 10:36:47 +1300 Subject: mac80211: typo fixes in kerneldoc comments Correct some trivial typos in kerneldoc comments. Signed-off-by: Chris Packham Link: https://lore.kernel.org/r/20191024213647.5507-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d69081c38788..67866fa1328d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -312,7 +312,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected * keep alive) changed. * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface - * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder + * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder * functionality changed for this BSS (AP mode). * @BSS_CHANGED_TWT: TWT status changed * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. @@ -1059,7 +1059,7 @@ struct ieee80211_tx_info { }; /** - * struct ieee80211_tx_status - extended tx staus info for rate control + * struct ieee80211_tx_status - extended tx status info for rate control * * @sta: Station that the packet was transmitted for * @info: Basic tx status information @@ -1702,7 +1702,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * %IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW. * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the * driver for a CCMP/GCMP key to indicate that is requires IV generation - * only for managment frames (MFP). + * only for management frames (MFP). * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. @@ -1998,7 +1998,7 @@ struct ieee80211_sta { * * * If the skb is transmitted as part of a BA agreement, the * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. - * * If the skb is not part of a BA aggreement, the A-MSDU maximal + * * If the skb is not part of a BA agreement, the A-MSDU maximal * size is min(max_amsdu_len, 7935) bytes. * * Both additional HT limits must be enforced by the low level @@ -3187,13 +3187,13 @@ enum ieee80211_rate_control_changed { * * With the support for multi channel contexts and multi channel operations, * remain on channel operations might be limited/deferred/aborted by other - * flows/operations which have higher priority (and vise versa). + * flows/operations which have higher priority (and vice versa). * Specifying the ROC type can be used by devices to prioritize the ROC * operations compared to other operations/flows. * * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC. * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required - * for sending managment frames offchannel. + * for sending management frames offchannel. */ enum ieee80211_roc_type { IEEE80211_ROC_TYPE_NORMAL = 0, @@ -5616,7 +5616,7 @@ void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, /** * ieee80211_iter_chan_contexts_atomic - iterate channel contexts - * @hw: pointre obtained from ieee80211_alloc_hw(). + * @hw: pointer obtained from ieee80211_alloc_hw(). * @iter: iterator function * @iter_data: data passed to iterator function * @@ -6364,7 +6364,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq, * again. * * The API ieee80211_txq_may_transmit() also ensures that TXQ list will be - * aligned aginst driver's own round-robin scheduler list. i.e it rotates + * aligned against driver's own round-robin scheduler list. i.e it rotates * the TXQ list till it makes the requested node becomes the first entry * in TXQ list. Thus both the TXQ list and driver's list are in sync. If this * function returns %true, the driver is expected to schedule packets -- cgit v1.2.3 From 3f2aef10ffad76c31275ae66b1d6e486b22619d6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 24 Oct 2019 11:32:12 -0700 Subject: mac80211: fix a typo of "function" Signed-off-by: Joe Perches Link: https://lore.kernel.org/r/4d53be6c963542878d370ff1a6dc7c3a89b28d23.camel@perches.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 67866fa1328d..f5996960eace 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2626,7 +2626,7 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw, * @hw: the hardware * @skb: the skb * - * Free a transmit skb. Use this funtion when some failure + * Free a transmit skb. Use this function when some failure * to transmit happened and thus status cannot be reported. */ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); -- cgit v1.2.3 From 8e01d9a396e6db153d94a6004e6473d9ff251a6a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 27 Oct 2019 14:41:59 +0000 Subject: KVM: arm64: vgic-v4: Move the GICv4 residency flow to be driven by vcpu_load/put When the VHE code was reworked, a lot of the vgic stuff was moved around, but the GICv4 residency code did stay untouched, meaning that we come in and out of residency on each flush/sync, which is obviously suboptimal. To address this, let's move things around a bit: - Residency entry (flush) moves to vcpu_load - Residency exit (sync) moves to vcpu_put - On blocking (entry to WFI), we "put" - On unblocking (exit from WFI), we "load" Because these can nest (load/block/put/load/unblock/put, for example), we now have per-VPE tracking of the residency state. Additionally, vgic_v4_put gains a "need doorbell" parameter, which only gets set to true when blocking because of a WFI. This allows a finer control of the doorbell, which now also gets disabled as soon as it gets signaled. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20191027144234.8395-2-maz@kernel.org --- include/kvm/arm_vgic.h | 4 ++-- include/linux/irqchip/arm-gic-v4.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index af4f09c02bf1..4dc58d7a0010 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -396,7 +396,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq, int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); -void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu); -void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu); +int vgic_v4_load(struct kvm_vcpu *vcpu); +int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db); #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index e6b155713b47..ab1396afe08a 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -35,6 +35,8 @@ struct its_vpe { /* Doorbell interrupt */ int irq; irq_hw_number_t vpe_db_lpi; + /* VPE resident */ + bool resident; /* VPE proxy mapping */ int vpe_proxy_event; /* -- cgit v1.2.3 From c199ce4f9dd896c716aece33e6750be34aea1151 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:22:01 +0200 Subject: net: Fix misspellings of "configure" and "configuration" Fix various misspellings of "configuration" and "configure". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 69df19aa8e72..a791a94013a6 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -286,7 +286,7 @@ struct dcbmsg { * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported * @DCB_CMD_SNUMTCS: set the number of traffic classes * @DCB_CMD_GBCN: set backward congestion notification configuration - * @DCB_CMD_SBCN: get backward congestion notification configration. + * @DCB_CMD_SBCN: get backward congestion notification configuration. * @DCB_CMD_GAPP: get application protocol configuration * @DCB_CMD_SAPP: set application protocol configuration * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration -- cgit v1.2.3 From e1b185491f739983b596804953586346e50351c9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:23:23 +0200 Subject: net: Fix various misspellings of "connect" Fix misspellings of "disconnect", "disconnecting", "connections", and "disconnected". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Acked-by: Simon Horman Signed-off-by: David S. Miller --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4ab2c49423dc..ab6850bbba99 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6593,7 +6593,7 @@ struct cfg80211_roam_info { * time it is accessed in __cfg80211_roamed() due to delay in scheduling * rdev->event_work. In case of any failures, the reference is released * either in cfg80211_roamed() or in __cfg80211_romed(), Otherwise, it will be - * released while diconneting from the current bss. + * released while disconnecting from the current bss. */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp); -- cgit v1.2.3 From 6b297524234ccf3954b54609ab6bc2e8c4d3f677 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 25 Oct 2019 01:03:51 +0200 Subject: net: dsa: Add support for devlink device parameters Add plumbing to allow DSA drivers to register parameters with devlink. To keep with the abstraction, the DSA drivers pass the ds structure to these helpers, and the DSA core then translates that to the devlink structure associated to the device. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e3c14dc3bab9..d5f6e5ccca38 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -550,6 +550,29 @@ struct dsa_switch_ops { */ netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, struct sk_buff *skb); + /* Devlink parameters */ + int (*devlink_param_get)(struct dsa_switch *ds, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*devlink_param_set)(struct dsa_switch *ds, u32 id, + struct devlink_param_gset_ctx *ctx); +}; + +#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ + DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, \ + dsa_devlink_param_get, dsa_devlink_param_set, NULL) + +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx); +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx); +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count); +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count); +struct dsa_devlink_priv { + struct dsa_switch *ds; }; struct dsa_switch_driver { -- cgit v1.2.3 From f7907e57aea2adcd0b57ebcca410e125412ab680 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 29 Oct 2019 01:21:31 +0000 Subject: regulator: fixed: add off-on-delay Depends on board design, the gpio controlling regulator may connects with a big capacitance. When need off, it takes some time to let the regulator to be truly off. If not add enough delay, the regulator might have always been on, so introduce off-on-delay to handle such case. Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1572311875-22880-3-git-send-email-peng.fan@nxp.com Signed-off-by: Mark Brown --- include/linux/regulator/fixed.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/regulator/fixed.h b/include/linux/regulator/fixed.h index d44ce5f18a56..55319943fcc5 100644 --- a/include/linux/regulator/fixed.h +++ b/include/linux/regulator/fixed.h @@ -36,6 +36,7 @@ struct fixed_voltage_config { const char *input_supply; int microvolts; unsigned startup_delay; + unsigned int off_on_delay; unsigned enabled_at_boot:1; struct regulator_init_data *init_data; }; -- cgit v1.2.3 From 9ff624cdbff4466a356892500699aea9318d584e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 29 Oct 2019 15:19:17 +0800 Subject: KVM: arm/arm64: vgic: Remove the declaration of kvm_send_userspace_msi() The callsite of kvm_send_userspace_msi() is currently arch agnostic. There seems no reason to keep an extra declaration of it in arm_vgic.h (we already have one in include/linux/kvm_host.h). Remove it. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20191029071919.177-2-yuzenghui@huawei.com --- include/kvm/arm_vgic.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4dc58d7a0010..f66a02dac8b0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -378,8 +378,6 @@ static inline int kvm_vgic_get_max_vcpus(void) return kvm_vgic_global_state.max_gic_vcpus; } -int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); - /** * kvm_vgic_setup_default_irq_routing: * Setup a default flat gsi routing table mapping all SPIs -- cgit v1.2.3 From bad36e4e8cdc9048948490293efefdbd85c40ecc Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 29 Oct 2019 15:19:18 +0800 Subject: KVM: arm/arm64: vgic: Fix some comments typo Fix various comments, including wrong function names, grammar mistakes and specification references. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20191029071919.177-3-yuzenghui@huawei.com --- include/kvm/arm_vgic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f66a02dac8b0..9d53f545a3d5 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -240,7 +240,7 @@ struct vgic_dist { * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share * one address across all redistributors. - * GICv3 spec: 6.1.2 "LPI Configuration tables" + * GICv3 spec: IHI 0069E 6.1.1 "LPI Configuration tables" */ u64 propbaser; -- cgit v1.2.3 From c3a31e605620c279163c14068a60869ea3fda203 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2019 13:59:56 -0600 Subject: io_uring: add support for IORING_REGISTER_FILES_UPDATE Allows the application to remove/replace/add files to/from a file set. Passes in a struct: struct io_uring_files_update { __u32 offset; __s32 *fds; }; that holds an array of fds, size of array passed in through the usual nr_args part of the io_uring_register() system call. The logic is as follows: 1) If ->fds[i] is -1, the existing file at i + ->offset is removed from the set. 2) If ->fds[i] is a valid fd, the existing file at i + ->offset is replaced with ->fds[i]. For case #2, is the existing file is currently empty (fd == -1), the new fd is simply added to the array. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ea57526a5b89..4f532d9c0554 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -150,5 +150,11 @@ struct io_uring_params { #define IORING_UNREGISTER_FILES 3 #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 +#define IORING_REGISTER_FILES_UPDATE 6 + +struct io_uring_files_update { + __u32 offset; + __s32 *fds; +}; #endif -- cgit v1.2.3 From 33a107f0a1b8df0ad925e39d8afc97bb78e0cec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Oct 2019 12:10:03 -0600 Subject: io_uring: allow application controlled CQ ring size We currently size the CQ ring as twice the SQ ring, to allow some flexibility in not overflowing the CQ ring. This is done because the SQE life time is different than that of the IO request itself, the SQE is consumed as soon as the kernel has seen the entry. Certain application don't need a huge SQ ring size, since they just submit IO in batches. But they may have a lot of requests pending, and hence need a big CQ ring to hold them all. By allowing the application to control the CQ ring size multiplier, we can cater to those applications more efficiently. If an application wants to define its own CQ ring size, it must set IORING_SETUP_CQSIZE in the setup flags, and fill out io_uring_params->cq_entries. The value must be a power of two. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4f532d9c0554..e0137ea6ad79 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,6 +50,7 @@ struct io_uring_sqe { #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 -- cgit v1.2.3 From a41525ab2e75987e809926352ebc6f1397da900e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Oct 2019 16:48:15 -0600 Subject: io_uring: add support for absolute timeouts This is a pretty trivial addition on top of the relative timeouts we have now, but it's handy for ensuring tighter timing for those that are building scheduling primitives on top of io_uring. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e0137ea6ad79..b402dfee5e15 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -70,6 +70,11 @@ struct io_uring_sqe { */ #define IORING_FSYNC_DATASYNC (1U << 0) +/* + * sqe->timeout_flags + */ +#define IORING_TIMEOUT_ABS (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 11365043e5271fea4c92189a976833da477a3a44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Oct 2019 09:08:32 -0600 Subject: io_uring: add support for canceling timeout requests We might have cases where the need for a specific timeout is gone, add support for canceling an existing timeout operation. This works like the POLL_REMOVE command, where the application passes in the user_data of the timeout it wishes to cancel in the sqe->addr field. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b402dfee5e15..6dc5ced1c37a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -64,6 +64,7 @@ struct io_uring_sqe { #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 +#define IORING_OP_TIMEOUT_REMOVE 12 /* * sqe->fsync_flags -- cgit v1.2.3 From c826bd7a743f275e2b68c16d595534063b400deb Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 15 Oct 2019 19:02:01 +0200 Subject: io_uring: add set of tracing events To trace io_uring activity one can get an information from workqueue and io trace events, but looks like some parts could be hard to identify via this approach. Making what happens inside io_uring more transparent is important to be able to reason about many aspects of it, hence introduce the set of tracing events. All such events could be roughly divided into two categories: * those, that are helping to understand correctness (from both kernel and an application point of view). E.g. a ring creation, file registration, or waiting for available CQE. Proposed approach is to get a pointer to an original structure of interest (ring context, or request), and then find relevant events. io_uring_queue_async_work also exposes a pointer to work_struct, to be able to track down corresponding workqueue events. * those, that provide performance related information. Mostly it's about events that change the flow of requests, e.g. whether an async work was queued, or delayed due to some dependencies. Another important case is how io_uring optimizations (e.g. registered files) are utilized. Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Signed-off-by: Jens Axboe --- include/Kbuild | 1 + include/trace/events/io_uring.h | 349 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 include/trace/events/io_uring.h (limited to 'include') diff --git a/include/Kbuild b/include/Kbuild index ffba79483cc5..61b66725d259 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -1028,6 +1028,7 @@ header-test- += trace/events/fsi_master_gpio.h header-test- += trace/events/huge_memory.h header-test- += trace/events/ib_mad.h header-test- += trace/events/ib_umad.h +header-test- += trace/events/io_uring.h header-test- += trace/events/iscsi.h header-test- += trace/events/jbd2.h header-test- += trace/events/kvm.h diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h new file mode 100644 index 000000000000..c5a905fbf1da --- /dev/null +++ b/include/trace/events/io_uring.h @@ -0,0 +1,349 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM io_uring + +#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IO_URING_H + +#include + +/** + * io_uring_create - called after a new io_uring context was prepared + * + * @fd: corresponding file descriptor + * @ctx: pointer to a ring context structure + * @sq_entries: actual SQ size + * @cq_entries: actual CQ size + * @flags: SQ ring flags, provided to io_uring_setup(2) + * + * Allows to trace io_uring creation and provide pointer to a context, that can + * be used later to find correlated events. + */ +TRACE_EVENT(io_uring_create, + + TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), + + TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), + + TP_STRUCT__entry ( + __field( int, fd ) + __field( void *, ctx ) + __field( u32, sq_entries ) + __field( u32, cq_entries ) + __field( u32, flags ) + ), + + TP_fast_assign( + __entry->fd = fd; + __entry->ctx = ctx; + __entry->sq_entries = sq_entries; + __entry->cq_entries = cq_entries; + __entry->flags = flags; + ), + + TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d", + __entry->ctx, __entry->fd, __entry->sq_entries, + __entry->cq_entries, __entry->flags) +); + +/** + * io_uring_register - called after a buffer/file/eventfd was succesfully + * registered for a ring + * + * @ctx: pointer to a ring context structure + * @opcode: describes which operation to perform + * @nr_user_files: number of registered files + * @nr_user_bufs: number of registered buffers + * @cq_ev_fd: whether eventfs registered or not + * @ret: return code + * + * Allows to trace fixed files/buffers/eventfds, that could be registered to + * avoid an overhead of getting references to them for every operation. This + * event, together with io_uring_file_get, can provide a full picture of how + * much overhead one can reduce via fixing. + */ +TRACE_EVENT(io_uring_register, + + TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, + unsigned nr_bufs, bool eventfd, long ret), + + TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( unsigned, opcode ) + __field( unsigned, nr_files ) + __field( unsigned, nr_bufs ) + __field( bool, eventfd ) + __field( long, ret ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->opcode = opcode; + __entry->nr_files = nr_files; + __entry->nr_bufs = nr_bufs; + __entry->eventfd = eventfd; + __entry->ret = ret; + ), + + TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " + "eventfd %d, ret %ld", + __entry->ctx, __entry->opcode, __entry->nr_files, + __entry->nr_bufs, __entry->eventfd, __entry->ret) +); + +/** + * io_uring_file_get - called before getting references to an SQE file + * + * @ctx: pointer to a ring context structure + * @fd: SQE file descriptor + * + * Allows to trace out how often an SQE file reference is obtained, which can + * help figuring out if it makes sense to use fixed files, or check that fixed + * files are used correctly. + */ +TRACE_EVENT(io_uring_file_get, + + TP_PROTO(void *ctx, int fd), + + TP_ARGS(ctx, fd), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, fd ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->fd = fd; + ), + + TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd) +); + +/** + * io_uring_queue_async_work - called before submitting a new async work + * + * @ctx: pointer to a ring context structure + * @rw: type of workqueue, normal or buffered writes + * @req: pointer to a submitted request + * @work: pointer to a submitted work_struct + * + * Allows to trace asynchronous work submission. + */ +TRACE_EVENT(io_uring_queue_async_work, + + TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work, + unsigned int flags), + + TP_ARGS(ctx, rw, req, work, flags), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, rw ) + __field( void *, req ) + __field( struct work_struct *, work ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->rw = rw; + __entry->req = req; + __entry->work = work; + __entry->flags = flags; + ), + + TP_printk("ring %p, request %p, flags %d, %s queue, work %p", + __entry->ctx, __entry->req, __entry->flags, + __entry->rw ? "buffered" : "normal", __entry->work) +); + +/** + * io_uring_defer_list - called before the io_uring work added into defer_list + * + * @ctx: pointer to a ring context structure + * @req: pointer to a deferred request + * @shadow: whether request is shadow or not + * + * Allows to track deferred requests, to get an insight about what requests are + * not started immediately. + */ +TRACE_EVENT(io_uring_defer, + + TP_PROTO(void *ctx, void *req, bool shadow), + + TP_ARGS(ctx, req, shadow), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( void *, req ) + __field( bool, shadow ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->req = req; + __entry->shadow = shadow; + ), + + TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req, + __entry->shadow ? ", shadow": "") +); + +/** + * io_uring_link - called before the io_uring request added into link_list of + * another request + * + * @ctx: pointer to a ring context structure + * @req: pointer to a linked request + * @target_req: pointer to a previous request, that would contain @req + * + * Allows to track linked requests, to understand dependencies between requests + * and how does it influence their execution flow. + */ +TRACE_EVENT(io_uring_link, + + TP_PROTO(void *ctx, void *req, void *target_req), + + TP_ARGS(ctx, req, target_req), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( void *, req ) + __field( void *, target_req ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->req = req; + __entry->target_req = target_req; + ), + + TP_printk("ring %p, request %p linked after %p", + __entry->ctx, __entry->req, __entry->target_req) +); + +/** + * io_uring_add_to_prev - called after a request was added into a previously + * submitted work + * + * @req: pointer to a request, added to a previous + * @ret: whether or not it was completed successfully + * + * Allows to track merged work, to figure out how often requests are piggy + * backed into other ones, changing the execution flow. + */ +TRACE_EVENT(io_uring_add_to_prev, + + TP_PROTO(void *req, bool ret), + + TP_ARGS(req, ret), + + TP_STRUCT__entry ( + __field( void *, req ) + __field( bool, ret ) + ), + + TP_fast_assign( + __entry->req = req; + __entry->ret = ret; + ), + + TP_printk("request %p, ret %d", __entry->req, __entry->ret) +); + +/** + * io_uring_cqring_wait - called before start waiting for an available CQE + * + * @ctx: pointer to a ring context structure + * @min_events: minimal number of events to wait for + * + * Allows to track waiting for CQE, so that we can e.g. troubleshoot + * situations, when an application wants to wait for an event, that never + * comes. + */ +TRACE_EVENT(io_uring_cqring_wait, + + TP_PROTO(void *ctx, int min_events), + + TP_ARGS(ctx, min_events), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( int, min_events ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->min_events = min_events; + ), + + TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) +); + +/** + * io_uring_fail_link - called before failing a linked request + * + * @req: request, which links were cancelled + * @link: cancelled link + * + * Allows to track linked requests cancellation, to see not only that some work + * was cancelled, but also which request was the reason. + */ +TRACE_EVENT(io_uring_fail_link, + + TP_PROTO(void *req, void *link), + + TP_ARGS(req, link), + + TP_STRUCT__entry ( + __field( void *, req ) + __field( void *, link ) + ), + + TP_fast_assign( + __entry->req = req; + __entry->link = link; + ), + + TP_printk("request %p, link %p", __entry->req, __entry->link) +); + +/** + * io_uring_submit_sqe - called before submitting one SQE + * + * @ctx: pointer to a ring context structure + * @force_nonblock: whether a context blocking or not + * @sq_thread: true if sq_thread has submitted this SQE + * + * Allows to track SQE submitting, to understand what was the source of it, SQ + * thread or io_uring_enter call. + */ +TRACE_EVENT(io_uring_submit_sqe, + + TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread), + + TP_ARGS(ctx, force_nonblock, sq_thread), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( bool, force_nonblock ) + __field( bool, sq_thread ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->force_nonblock = force_nonblock; + __entry->sq_thread = sq_thread; + ), + + TP_printk("ring %p, non block %d, sq_thread %d", + __entry->ctx, __entry->force_nonblock, __entry->sq_thread) +); + +#endif /* _TRACE_IO_URING_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 771b53d033e8663abdf59704806aa856b236dcdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 22 Oct 2019 10:25:58 -0600 Subject: io-wq: small threadpool implementation for io_uring This adds support for io-wq, a smaller and specialized thread pool implementation. This is meant to replace workqueues for io_uring. Among the reasons for this addition are: - We can assign memory context smarter and more persistently if we manage the life time of threads. - We can drop various work-arounds we have in io_uring, like the async_list. - We can implement hashed work insertion, to manage concurrency of buffered writes without needing a) an extra workqueue, or b) needlessly making the concurrency of said workqueue very low which hurts performance of multiple buffered file writers. - We can implement cancel through signals, for cancelling interruptible work like read/write (or send/recv) to/from sockets. - We need the above cancel for being able to assign and use file tables from a process. - We can implement a more thorough cancel operation in general. - We need it to move towards a syslet/threadlet model for even faster async execution. For that we need to take ownership of the used threads. This list is just off the top of my head. Performance should be the same, or better, at least that's what I've seen in my testing. io-wq supports basic NUMA functionality, setting up a pool per node. io-wq hooks up to the scheduler schedule in/out just like workqueue and uses that to drive the need for more/less workers. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 67a1d86981a9..6666e25606b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1468,6 +1468,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -- cgit v1.2.3 From 561fb04a6a2257716738dac2ed812f377c2634c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Oct 2019 07:25:42 -0600 Subject: io_uring: replace workqueue usage with io-wq Drop various work-arounds we have for workqueues: - We no longer need the async_list for tracking sequential IO. - We don't have to maintain our own mm tracking/setting. - We don't need a separate workqueue for buffered writes. This didn't even work that well to begin with, as it was suboptimal for multiple buffered writers on multiple files. - We can properly cancel pending interruptible work. This fixes deadlocks with particularly socket IO, where we cannot cancel them when the io_uring is closed. Hence the ring will wait forever for these requests to complete, which may never happen. This is different from disk IO where we know requests will complete in a finite amount of time. - Due to being able to cancel work interruptible work that is already running, we can implement file table support for work. We need that for supporting system calls that add to a process file table. - It gets us one step closer to adding async support for any system call. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index c5a905fbf1da..b85255121b98 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,8 @@ #include +struct io_wq_work; + /** * io_uring_create - called after a new io_uring context was prepared * @@ -126,15 +128,15 @@ TRACE_EVENT(io_uring_file_get, * io_uring_queue_async_work - called before submitting a new async work * * @ctx: pointer to a ring context structure - * @rw: type of workqueue, normal or buffered writes + * @hashed: type of workqueue, hashed or normal * @req: pointer to a submitted request - * @work: pointer to a submitted work_struct + * @work: pointer to a submitted io_wq_work * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, int rw, void * req, struct work_struct *work, + TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work, unsigned int flags), TP_ARGS(ctx, rw, req, work, flags), @@ -143,7 +145,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( void *, ctx ) __field( int, rw ) __field( void *, req ) - __field( struct work_struct *, work ) + __field( struct io_wq_work *, work ) __field( unsigned int, flags ) ), @@ -157,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work, TP_printk("ring %p, request %p, flags %d, %s queue, work %p", __entry->ctx, __entry->req, __entry->flags, - __entry->rw ? "buffered" : "normal", __entry->work) + __entry->rw ? "hashed" : "normal", __entry->work) ); /** -- cgit v1.2.3 From de2ea4b64b75a79ed9cdf9bf30e0e197901084e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Oct 2019 14:41:29 -0600 Subject: net: add __sys_accept4_file() helper This is identical to __sys_accept4(), except it takes a struct file instead of an fd, and it also allows passing in extra file->f_flags flags. The latter is done to support masking in O_NONBLOCK without manipulating the original file flags. No functional changes in this patch. Cc: netdev@vger.kernel.org Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/socket.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index fc0bed59fc84..dd061f741bc1 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); +extern int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); -- cgit v1.2.3 From 17f2fe35d080d8f64e86a60cdcd3a97edcbc213b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Oct 2019 14:42:58 -0600 Subject: io_uring: add support for IORING_OP_ACCEPT This allows an application to call accept4() in an async fashion. Like other opcodes, we first try a non-blocking accept, then punt to async context if we have to. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6dc5ced1c37a..f82d90e617a6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -19,7 +19,10 @@ struct io_uring_sqe { __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ - __u64 off; /* offset into file */ + union { + __u64 off; /* offset into file */ + __u64 addr2; + }; __u64 addr; /* pointer to buffer or iovecs */ __u32 len; /* buffer size or number of iovecs */ union { @@ -29,6 +32,7 @@ struct io_uring_sqe { __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; + __u32 accept_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -65,6 +69,7 @@ struct io_uring_sqe { #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 +#define IORING_OP_ACCEPT 13 /* * sqe->fsync_flags -- cgit v1.2.3 From d607525bd912860aad137326a1076d1e9880ddf0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 25 Oct 2019 14:48:53 -0400 Subject: net: dsa: return directly from dsa_to_port Return directly from within the loop as soon as the port is found, otherwise we won't return NULL if the end of the list is reached. Fixes: b96ddf254b09 ("net: dsa: use ports list in dsa_to_port") Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d5f6e5ccca38..9aba326abb64 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -283,13 +283,13 @@ struct dsa_switch { static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *dp = NULL; + struct dsa_port *dp; list_for_each_entry(dp, &dst->ports, list) if (dp->ds == ds && dp->index == p) - break; + return dp; - return dp; + return NULL; } static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 8466a57dfbb0c9bf6db4685ed9c4144b8deec688 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 29 Oct 2019 12:43:46 +0100 Subject: net/smc: remove unneeded include for smc.h The only smc-related reference in net/sock.h is struct smc_hashinfo. But just its address is refered to. Thus there is no need for the include of net/smc.h. Remove it. Suggested-by: Jakub Kicinski Reviewed by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 380312cc67a9..09c26a5ecbff 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -66,7 +66,6 @@ #include #include #include -#include #include /* -- cgit v1.2.3 From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 30 Oct 2019 14:00:41 +0100 Subject: tipc: add smart nagle feature We introduce a feature that works like a combination of TCP_NAGLE and TCP_CORK, but without some of the weaknesses of those. In particular, we will not observe long delivery delays because of delayed acks, since the algorithm itself decides if and when acks are to be sent from the receiving peer. - The nagle property as such is determined by manipulating a new 'maxnagle' field in struct tipc_sock. If certain conditions are met, 'maxnagle' will define max size of the messages which can be bundled. If it is set to zero no messages are ever bundled, implying that the nagle property is disabled. - A socket with the nagle property enabled enters nagle mode when more than 4 messages have been sent out without receiving any data message from the peer. - A socket leaves nagle mode whenever it receives a data message from the peer. In nagle mode, messages smaller than 'maxnagle' are accumulated in the socket write queue. The last buffer in the queue is marked with a new 'ack_required' bit, which forces the receiving peer to send a CONN_ACK message back to the sender upon reception. The accumulated contents of the write queue is transmitted when one of the following events or conditions occur. - A CONN_ACK message is received from the peer. - A data message is received from the peer. - A SOCK_WAKEUP pseudo message is received from the link level. - The write queue contains more than 64 1k blocks of data. - The connection is being shut down. - There is no CONN_ACK message to expect. I.e., there is currently no outstanding message where the 'ack_required' bit was set. As a consequence, the first message added after we enter nagle mode is always sent directly with this bit set. This new feature gives a 50-100% improvement of throughput for small (i.e., less than MTU size) messages, while it might add up to one RTT to latency time when the socket is in nagle mode. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7df026ea6aff..76421b878767 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -191,6 +191,7 @@ struct sockaddr_tipc { #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ +#define TIPC_NODELAY 138 /* Default: false */ /* * Flag values -- cgit v1.2.3 From 98298e6ca6d5908f96e529e70a254a4d5bf754e7 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:50 +0100 Subject: flow_dissector: add meaningful comments Documents two piece of code which can't be understood at a glance. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 90bd210be060..7747af3cc500 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -282,6 +282,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; -- cgit v1.2.3 From 5dec597e5cd0f4c3000d120508efa64157d5bd7a Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:52 +0100 Subject: flow_dissector: extract more ICMP information The ICMP flow dissector currently parses only the Type and Code fields. Some ICMP packets (echo, timestamp) have a 16 bit Identifier field which is used to correlate packets. Add such field in flow_dissector_key_icmp and replace skb_flow_get_be16() with a more complex function which populate this field. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 7747af3cc500..f8541d018848 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,8 @@ #include #include +struct sk_buff; + /** * struct flow_dissector_key_control: * @thoff: Transport header offset @@ -156,19 +158,16 @@ struct flow_dissector_key_ports { /** * flow_dissector_key_icmp: - * @ports: type and code of ICMP header - * icmp: ICMP type (high) and code (low) * type: ICMP type * code: ICMP code + * id: session identifier */ struct flow_dissector_key_icmp { - union { - __be16 icmp; - struct { - u8 type; - u8 code; - }; + struct { + u8 type; + u8 code; }; + u16 id; }; /** @@ -282,6 +281,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; @@ -316,6 +316,9 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) -- cgit v1.2.3 From c8ecebd04cbb6badb46d42fe54282e7883ed63cc Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:00 +0200 Subject: net: sched: extract common action counters update code into function Currently, all implementations of tc_action_ops->stats_update() callback have almost exactly the same implementation of counters update code (besides gact which also updates drop counter). In order to simplify support for using both percpu-allocated and regular action counters depending on run-time flag in following patches, extract action counters update code into standalone function in act API. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index b18c699681ca..f6f66c692385 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -186,6 +186,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); +void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, + bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, -- cgit v1.2.3 From 5e1ad95b630e652d3467d1fd1f0b5e5ea2c441e2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:01 +0200 Subject: net: sched: extract bstats update code into function Extract common code that increments cpu_bstats counter into standalone act API function. Change hardware offloaded actions that use percpu counter allocation to use the new function instead of incrementing cpu_bstats directly. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index f6f66c692385..9a32853f77f9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -186,6 +186,13 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); + +static inline void tcf_action_update_bstats(struct tc_action *a, + struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); +} + void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); -- cgit v1.2.3 From 26b537a88ca5b7399c7ab0656e06dbd9da9513c1 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:02 +0200 Subject: net: sched: extract qstats update code into functions Extract common code that increments cpu_qstats counters into standalone act API functions. Change hardware offloaded actions that use percpu counter allocation to use the new functions instead of accessing cpu_qstats directly. This commit doesn't change functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9a32853f77f9..8d6861ce205b 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -193,6 +193,22 @@ static inline void tcf_action_update_bstats(struct tc_action *a, bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); } +static inline struct gnet_stats_queue * +tcf_action_get_qstats(struct tc_action *a) +{ + return this_cpu_ptr(a->cpu_qstats); +} + +static inline void tcf_action_inc_drop_qstats(struct tc_action *a) +{ + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); +} + +static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) +{ + qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); +} + void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, bool drop, bool hw); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); -- cgit v1.2.3 From ef816f3c49c1c404ababc50e10d4cbe5109da678 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:03 +0200 Subject: net: sched: don't expose action qstats to skb_tc_reinsert() Previous commit introduced helper function for updating qstats and refactored set of actions to use the helpers, instead of modifying qstats directly. However, one of the affected action exposes its qstats to skb_tc_reinsert(), which then modifies it. Refactor skb_tc_reinsert() to return integer error code and don't increment overlimit qstats in case of error, and use the returned error code in tcf_mirred_act() to manually increment the overlimit counter with new helper function. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 637548d54b3e..a8b0a9a4c686 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1286,17 +1286,9 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); -static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) { - struct gnet_stats_queue *stats = res->qstats; - int ret; - - if (res->ingress) - ret = netif_receive_skb(skb); - else - ret = dev_queue_xmit(skb); - if (ret && stats) - qstats_overlimit_inc(res->qstats); + return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb); } #endif -- cgit v1.2.3 From 5e174d5e73dfbfb2c4bc4804f58f2f2aa34c9281 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:04 +0200 Subject: net: sched: modify stats helper functions to support regular stats Modify stats update helper functions introduced in previous patches in this series to fallback to regular tc_action->tcfa_{b|q}stats if cpu stats are not allocated for the action argument. If regular non-percpu allocated counters are in use, then obtain action tcfa_lock while modifying them. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8d6861ce205b..a56477051dae 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -190,23 +190,35 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) { - bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); -} - -static inline struct gnet_stats_queue * -tcf_action_get_qstats(struct tc_action *a) -{ - return this_cpu_ptr(a->cpu_qstats); + if (likely(a->cpu_bstats)) { + bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); + return; + } + spin_lock(&a->tcfa_lock); + bstats_update(&a->tcfa_bstats, skb); + spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_drop_qstats(struct tc_action *a) { - qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + if (likely(a->cpu_qstats)) { + qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); + return; + } + spin_lock(&a->tcfa_lock); + qstats_drop_inc(&a->tcfa_qstats); + spin_unlock(&a->tcfa_lock); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) { - qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); + if (likely(a->cpu_qstats)) { + qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); + return; + } + spin_lock(&a->tcfa_lock); + qstats_overlimit_inc(&a->tcfa_qstats); + spin_unlock(&a->tcfa_lock); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets, -- cgit v1.2.3 From abbb0d33632ce931ca9c814813ee131351f6b92f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:05 +0200 Subject: net: sched: extend TCA_ACT space with TCA_ACT_FLAGS Extend TCA_ACT space with nla_bitfield32 flags. Add TCA_ACT_FLAGS_NO_PERCPU_STATS as the only allowed flag. Parse the flags in tcf_action_init_1() and pass resulting value as additional argument to a_o->init(). Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/uapi/linux/pkt_cls.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index a56477051dae..85e95c44c7f9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -94,7 +94,7 @@ struct tc_action_ops { int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, - struct netlink_ext_ack *extack); + u32 flags, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a6aa466fac9e..c6ad22f76ede 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -16,9 +16,14 @@ enum { TCA_ACT_STATS, TCA_ACT_PAD, TCA_ACT_COOKIE, + TCA_ACT_FLAGS, __TCA_ACT_MAX }; +#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for + * actions stats. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 -- cgit v1.2.3 From e38226786022d2d8e5876ab7bc37e82b0eb57e65 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:06 +0200 Subject: net: sched: update action implementations to support flags Extend struct tc_action with new "tcfa_flags" field. Set the field in tcf_idr_create() function and provide new helper tcf_idr_create_from_flags() that derives 'cpustats' boolean from flags value. Update individual hardware-offloaded actions init() to pass their "flags" argument to new helper in order to skip percpu stats allocation when user requested it through flags. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 85e95c44c7f9..0495bdc034d2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -41,6 +41,7 @@ struct tc_action { struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie __rcu *act_cookie; struct tcf_chain __rcu *goto_chain; + u32 tcfa_flags; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt @@ -154,7 +155,11 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats); + int bind, bool cpustats, u32 flags); +int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, + struct nlattr *est, struct tc_action **a, + const struct tc_action_ops *ops, int bind, + u32 flags); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); -- cgit v1.2.3 From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_RPOG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- include/linux/bpf.h | 5 +++++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 171be30fe0ae..80158cff44bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -373,6 +373,11 @@ enum bpf_cgroup_storage_type { #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +/* The longest tracepoint has 12 args. + * See include/trace/bpf_probe.h + */ +#define MAX_BPF_FUNC_ARGS 12 + struct bpf_prog_stats { u64 cnt; u64 nsecs; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 36a9c2325176..de14872b01ba 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4af8b0819a32..a6bf19dabaab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -173,6 +173,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, }; enum bpf_attach_type { @@ -199,6 +200,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 246880958ac93989c97c73ae1e60b78b4c4c88c5 Mon Sep 17 00:00:00 2001 From: Vikas Gupta Date: Thu, 31 Oct 2019 15:38:50 +0530 Subject: firmware: broadcom: add OP-TEE based BNXT f/w manager This driver registers on TEE bus to interact with OP-TEE based BNXT firmware management modules Cc: Jakub Kicinski Reported-by: kbuild test robot Signed-off-by: Vikas Gupta Signed-off-by: Sheetal Tigadoli Signed-off-by: David S. Miller --- include/linux/firmware/broadcom/tee_bnxt_fw.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 include/linux/firmware/broadcom/tee_bnxt_fw.h (limited to 'include') diff --git a/include/linux/firmware/broadcom/tee_bnxt_fw.h b/include/linux/firmware/broadcom/tee_bnxt_fw.h new file mode 100644 index 000000000000..f24c82d6ef73 --- /dev/null +++ b/include/linux/firmware/broadcom/tee_bnxt_fw.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright 2019 Broadcom. + */ + +#ifndef _BROADCOM_TEE_BNXT_FW_H +#define _BROADCOM_TEE_BNXT_FW_H + +#include + +int tee_bnxt_fw_load(void); +int tee_bnxt_copy_coredump(void *buf, u32 offset, u32 size); + +#endif /* _BROADCOM_TEE_BNXT_FW_H */ -- cgit v1.2.3 From a2d00f3db73dc4f6f6afcc95c1db809ea9019306 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Thu, 31 Oct 2019 16:37:58 +0200 Subject: soc: fsl: qbman: allow registering a device link for the portal user Introduce the API required to make sure that the devices that use the QMan portal are unbound when the portal is unbound. Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- include/soc/fsl/qman.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index aa31c05a103a..c499c5cfa7c9 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -32,6 +32,7 @@ #define __FSL_QMAN_H #include +#include /* Hardware constants */ #define QM_CHANNEL_SWPORTAL0 0 @@ -914,6 +915,23 @@ u16 qman_affine_channel(int cpu); */ struct qman_portal *qman_get_affine_portal(int cpu); +/** + * qman_start_using_portal - register a device link for the portal user + * @p: the portal that will be in use + * @dev: the device that will use the portal + * + * Makes sure that the devices that use the portal are unbound when the + * portal is unbound + */ +int qman_start_using_portal(struct qman_portal *p, struct device *dev); + +/** + * qman_stop_using_portal - deregister a device link for the portal user + * @p: the portal that will no longer be in use + * @dev: the device that uses the portal + */ +void qman_stop_using_portal(struct qman_portal *p, struct device *dev); + /** * qman_p_poll_dqrr - process DQRR (fast-path) entries * @limit: the maximum number of DQRR entries to process -- cgit v1.2.3 From e06eea555b878f2c95b498aa1c485250ad30c960 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Thu, 31 Oct 2019 16:37:59 +0200 Subject: dpaa_eth: register a device link for the qman portal used Before this change, unbinding the QMan portals did not trigger a corresponding unbinding of the dpaa_eth making use of it; the first QMan portal related operation issued afterwards crashed the kernel. The device link ensures the dpaa_eth dependency upon the qman portal used is honoured at the QMan portal removal. Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- include/soc/fsl/qman.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index c499c5cfa7c9..cfe00e08e85b 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -925,13 +925,6 @@ struct qman_portal *qman_get_affine_portal(int cpu); */ int qman_start_using_portal(struct qman_portal *p, struct device *dev); -/** - * qman_stop_using_portal - deregister a device link for the portal user - * @p: the portal that will no longer be in use - * @dev: the device that uses the portal - */ -void qman_stop_using_portal(struct qman_portal *p, struct device *dev); - /** * qman_p_poll_dqrr - process DQRR (fast-path) entries * @limit: the maximum number of DQRR entries to process -- cgit v1.2.3 From 1ac210d128ef6e92698dd3aa4e2e03e831bc9906 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 31 Oct 2019 01:18:29 +0200 Subject: bus: fsl-mc: add the fsl_mc_get_endpoint function Using the newly added fsl_mc_get_endpoint function a fsl-mc driver can find its associated endpoint (another object at the other link of a MC firmware link). The API will be used in the following patch in order to discover the connected DPMAC object of a DPNI. Also, the fsl_mc_device_lookup function is made available to the entire fsl-mc bus driver and not just for the dprc driver. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/linux/fsl/mc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 975553a9f75d..54d9436600c7 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -403,6 +403,8 @@ int __must_check fsl_mc_allocate_irqs(struct fsl_mc_device *mc_dev); void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev); +struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev); + extern struct bus_type fsl_mc_bus_type; extern struct device_type fsl_mc_bus_dprc_type; -- cgit v1.2.3 From c5f51765a1f60b701840544faf3ca63204b8dc3c Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:13 -0400 Subject: net: dsa: list DSA links in the fabric Implement a new list of DSA links in the switch fabric itself, to provide an alterative to the ds->rtable static arrays. At the same time, provide a new dsa_routing_port() helper to abstract the usage of ds->rtable in drivers. If there's no port to reach a given device, return the first invalid port, ds->num_ports. This avoids potential signedness errors or the need to define special values. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9aba326abb64..3d7366d634d8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -123,6 +123,9 @@ struct dsa_switch_tree { /* List of switch ports */ struct list_head ports; + /* List of DSA links composing the routing table */ + struct list_head rtable; + /* * Data for the individual switch chips. */ @@ -214,6 +217,17 @@ struct dsa_port { bool setup; }; +/* TODO: ideally DSA ports would have a single dp->link_dp member, + * and no dst->rtable nor this struct dsa_link would be needed, + * but this would require some more complex tree walking, + * so keep it stupid at the moment and list them all. + */ +struct dsa_link { + struct dsa_port *dp; + struct dsa_port *link_dp; + struct list_head list; +}; + struct dsa_switch { bool setup; @@ -324,6 +338,19 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds) return mask; } +/* Return the local port used to reach an arbitrary switch device */ +static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device) +{ + struct dsa_switch_tree *dst = ds->dst; + struct dsa_link *dl; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp->ds == ds && dl->link_dp->ds->index == device) + return dl->dp->index; + + return ds->num_ports; +} + /* Return the local port used to reach an arbitrary switch port */ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, int port) @@ -331,7 +358,7 @@ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, if (device == ds->index) return port; else - return ds->rtable[device]; + return dsa_routing_port(ds, device); } /* Return the local port used to reach the dedicated CPU port */ -- cgit v1.2.3 From 96252b8e05326df072cd321159878aa4725c5bd4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:14 -0400 Subject: net: dsa: remove ds->rtable Drivers do not use the ds->rtable static arrays anymore, get rid of it. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3d7366d634d8..b46222adb5c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -258,13 +258,6 @@ struct dsa_switch { */ const struct dsa_switch_ops *ops; - /* - * An array of which element [a] indicates which port on this - * switch should be used to send packets to that are destined - * for switch a. Can be NULL if there is only one switch chip. - */ - s8 rtable[DSA_MAX_SWITCHES]; - /* * Slave mii_bus and devices for the individual ports. */ -- cgit v1.2.3 From 9c8ad1ab66b577526a4c89e4a222e0fac431a2d6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 30 Oct 2019 22:09:16 -0400 Subject: net: dsa: remove the dst->ds array Now that the DSA ports are listed in the switch fabric, there is no need to store the dsa_switch structures from the drivers in the fabric anymore. So get rid of the dst->ds static array. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b46222adb5c2..e4c697b95c70 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -125,11 +125,6 @@ struct dsa_switch_tree { /* List of DSA links composing the routing table */ struct list_head rtable; - - /* - * Data for the individual switch chips. - */ - struct dsa_switch *ds[DSA_MAX_SWITCHES]; }; /* TC matchall action types, only mirroring for now */ -- cgit v1.2.3 From 268a2d60013049cfd9a0aada77284aa6ea8ad26a Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Sun, 20 Oct 2019 22:43:13 +0800 Subject: MIPS: Loongson64: Rename CPU TYPES CPU_LOONGSON2 -> CPU_LOONGSON2EF CPU_LOONGSON3 -> CPU_LOONGSON64 As newer loongson-2 products (2G/2H/2K1000) can share kernel implementation with loongson-3 while 2E/2F are less similar with other LOONGSON64 products. Signed-off-by: Jiaxun Yang Signed-off-by: Paul Burton Cc: linux-mips@vger.kernel.org Cc: chenhc@lemote.com Cc: paul.burton@mips.com --- include/drm/drm_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h index 987ff16b9420..e9ad4863d915 100644 --- a/include/drm/drm_cache.h +++ b/include/drm/drm_cache.h @@ -45,7 +45,7 @@ static inline bool drm_arch_can_wc_memory(void) { #if defined(CONFIG_PPC) && !defined(CONFIG_NOT_COHERENT_CACHE) return false; -#elif defined(CONFIG_MIPS) && defined(CONFIG_CPU_LOONGSON3) +#elif defined(CONFIG_MIPS) && defined(CONFIG_CPU_LOONGSON64) return false; #elif defined(CONFIG_ARM) || defined(CONFIG_ARM64) /* -- cgit v1.2.3 From d3ca75a8b3d77f2788e6c119ea7c3e3a1ab1e1ca Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Oct 2019 12:41:09 -0700 Subject: crypto: skcipher - unify the crypto_has_skcipher*() functions crypto_has_skcipher() and crypto_has_skcipher2() do the same thing: they check for the availability of an algorithm of type skcipher, blkcipher, or ablkcipher, which also meets any non-type constraints the caller specified. And they have exactly the same prototype. Therefore, eliminate the redundancy by removing crypto_has_skcipher() and renaming crypto_has_skcipher2() to crypto_has_skcipher(). Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/skcipher.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include') diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index aada87916918..e34993f5d190 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -218,30 +218,13 @@ static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm) * crypto_has_skcipher() - Search for the availability of an skcipher. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Return: true when the skcipher is known to the kernel crypto API; false - * otherwise - */ -static inline int crypto_has_skcipher(const char *alg_name, u32 type, - u32 mask) -{ - return crypto_has_alg(alg_name, crypto_skcipher_type(type), - crypto_skcipher_mask(mask)); -} - -/** - * crypto_has_skcipher2() - Search for the availability of an skcipher. - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * skcipher * @type: specifies the type of the skcipher * @mask: specifies the mask for the skcipher * * Return: true when the skcipher is known to the kernel crypto API; false * otherwise */ -int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask); +int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) -- cgit v1.2.3 From cec0cb8a28f9060367099beeafd0dbdb76fdfae2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Oct 2019 12:41:10 -0700 Subject: crypto: skcipher - remove crypto_has_ablkcipher() crypto_has_ablkcipher() has no users, and it does the same thing as crypto_has_skcipher() anyway. So remove it. This also removes the last user of crypto_skcipher_type() and crypto_skcipher_mask(), so remove those too. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/linux/crypto.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 19ea3a371d7b..b7855743f7e3 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -900,20 +900,6 @@ static inline struct crypto_ablkcipher *__crypto_ablkcipher_cast( return (struct crypto_ablkcipher *)tfm; } -static inline u32 crypto_skcipher_type(u32 type) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_BLKCIPHER; - return type; -} - -static inline u32 crypto_skcipher_mask(u32 mask) -{ - mask &= ~CRYPTO_ALG_TYPE_MASK; - mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK; - return mask; -} - /** * DOC: Asynchronous Block Cipher API * @@ -959,23 +945,6 @@ static inline void crypto_free_ablkcipher(struct crypto_ablkcipher *tfm) crypto_free_tfm(crypto_ablkcipher_tfm(tfm)); } -/** - * crypto_has_ablkcipher() - Search for the availability of an ablkcipher. - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * ablkcipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Return: true when the ablkcipher is known to the kernel crypto API; false - * otherwise - */ -static inline int crypto_has_ablkcipher(const char *alg_name, u32 type, - u32 mask) -{ - return crypto_has_alg(alg_name, crypto_skcipher_type(type), - crypto_skcipher_mask(mask)); -} - static inline struct ablkcipher_tfm *crypto_ablkcipher_crt( struct crypto_ablkcipher *tfm) { -- cgit v1.2.3 From c65058b7587fd3d001c57a50285477be521f5350 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Oct 2019 12:41:12 -0700 Subject: crypto: skcipher - remove the "blkcipher" algorithm type Now that all "blkcipher" algorithms have been converted to "skcipher", remove the blkcipher algorithm type. The skcipher (symmetric key cipher) algorithm type was introduced a few years ago to replace both blkcipher and ablkcipher (synchronous and asynchronous block cipher). The advantages of skcipher include: - A much less confusing name, since none of these algorithm types have ever actually been for raw block ciphers, but rather for all length-preserving encryption modes including block cipher modes of operation, stream ciphers, and other length-preserving modes. - It unified blkcipher and ablkcipher into a single algorithm type which supports both synchronous and asynchronous implementations. Note, blkcipher already operated only on scatterlists, so the fact that skcipher does too isn't a regression in functionality. - Better type safety by using struct skcipher_alg, struct crypto_skcipher, etc. instead of crypto_alg, crypto_tfm, etc. - It sometimes simplifies the implementations of algorithms. Also, the blkcipher API was no longer being tested. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/algapi.h | 74 ------- include/crypto/internal/skcipher.h | 12 -- include/crypto/skcipher.h | 8 - include/linux/crypto.h | 395 +------------------------------------ 4 files changed, 2 insertions(+), 487 deletions(-) (limited to 'include') diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index e5bd302f2c49..cadc5257c612 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -85,36 +85,6 @@ struct scatter_walk { unsigned int offset; }; -struct blkcipher_walk { - union { - struct { - struct page *page; - unsigned long offset; - } phys; - - struct { - u8 *page; - u8 *addr; - } virt; - } src, dst; - - struct scatter_walk in; - unsigned int nbytes; - - struct scatter_walk out; - unsigned int total; - - void *page; - u8 *buffer; - u8 *iv; - unsigned int ivsize; - - int flags; - unsigned int walk_blocksize; - unsigned int cipher_blocksize; - unsigned int alignmask; -}; - struct ablkcipher_walk { struct { struct page *page; @@ -133,7 +103,6 @@ struct ablkcipher_walk { }; extern const struct crypto_type crypto_ablkcipher_type; -extern const struct crypto_type crypto_blkcipher_type; void crypto_mod_put(struct crypto_alg *alg); @@ -233,20 +202,6 @@ static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2, } } -int blkcipher_walk_done(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, int err); -int blkcipher_walk_virt(struct blkcipher_desc *desc, - struct blkcipher_walk *walk); -int blkcipher_walk_phys(struct blkcipher_desc *desc, - struct blkcipher_walk *walk); -int blkcipher_walk_virt_block(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, - unsigned int blocksize); -int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc, - struct blkcipher_walk *walk, - struct crypto_aead *tfm, - unsigned int blocksize); - int ablkcipher_walk_done(struct ablkcipher_request *req, struct ablkcipher_walk *walk, int err); int ablkcipher_walk_phys(struct ablkcipher_request *req, @@ -286,25 +241,6 @@ static inline void *crypto_ablkcipher_ctx_aligned(struct crypto_ablkcipher *tfm) return crypto_tfm_ctx_aligned(&tfm->base); } -static inline struct crypto_blkcipher *crypto_spawn_blkcipher( - struct crypto_spawn *spawn) -{ - u32 type = CRYPTO_ALG_TYPE_BLKCIPHER; - u32 mask = CRYPTO_ALG_TYPE_MASK; - - return __crypto_blkcipher_cast(crypto_spawn_tfm(spawn, type, mask)); -} - -static inline void *crypto_blkcipher_ctx(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_ctx(&tfm->base); -} - -static inline void *crypto_blkcipher_ctx_aligned(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_ctx_aligned(&tfm->base); -} - static inline struct crypto_cipher *crypto_spawn_cipher( struct crypto_spawn *spawn) { @@ -319,16 +255,6 @@ static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm) return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher; } -static inline void blkcipher_walk_init(struct blkcipher_walk *walk, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - walk->in.sg = src; - walk->out.sg = dst; - walk->total = nbytes; -} - static inline void ablkcipher_walk_init(struct ablkcipher_walk *walk, struct scatterlist *dst, struct scatterlist *src, diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 3175dfeaed2c..454e898d5f5f 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -182,10 +182,6 @@ static inline u32 skcipher_request_flags(struct skcipher_request *req) static inline unsigned int crypto_skcipher_alg_min_keysize( struct skcipher_alg *alg) { - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blkcipher.min_keysize; - if (alg->base.cra_ablkcipher.encrypt) return alg->base.cra_ablkcipher.min_keysize; @@ -195,10 +191,6 @@ static inline unsigned int crypto_skcipher_alg_min_keysize( static inline unsigned int crypto_skcipher_alg_max_keysize( struct skcipher_alg *alg) { - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blkcipher.max_keysize; - if (alg->base.cra_ablkcipher.encrypt) return alg->base.cra_ablkcipher.max_keysize; @@ -208,10 +200,6 @@ static inline unsigned int crypto_skcipher_alg_max_keysize( static inline unsigned int crypto_skcipher_alg_walksize( struct skcipher_alg *alg) { - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blocksize; - if (alg->base.cra_ablkcipher.encrypt) return alg->base.cra_blocksize; diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index e34993f5d190..8c5a31e810da 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -241,10 +241,6 @@ static inline struct skcipher_alg *crypto_skcipher_alg( static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg) { - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blkcipher.ivsize; - if (alg->base.cra_ablkcipher.encrypt) return alg->base.cra_ablkcipher.ivsize; @@ -290,10 +286,6 @@ static inline unsigned int crypto_skcipher_blocksize( static inline unsigned int crypto_skcipher_alg_chunksize( struct skcipher_alg *alg) { - if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == - CRYPTO_ALG_TYPE_BLKCIPHER) - return alg->base.cra_blocksize; - if (alg->base.cra_ablkcipher.encrypt) return alg->base.cra_blocksize; diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b7855743f7e3..e9f2c6b5d800 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -41,7 +41,6 @@ #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 #define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 #define CRYPTO_ALG_TYPE_AEAD 0x00000003 -#define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_KPP 0x00000008 @@ -55,7 +54,6 @@ #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e -#define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c #define CRYPTO_ALG_TYPE_ACOMPRESS_MASK 0x0000000e #define CRYPTO_ALG_LARVAL 0x00000010 @@ -141,7 +139,6 @@ struct scatterlist; struct crypto_ablkcipher; struct crypto_async_request; -struct crypto_blkcipher; struct crypto_tfm; struct crypto_type; @@ -176,12 +173,6 @@ struct ablkcipher_request { void *__ctx[] CRYPTO_MINALIGN_ATTR; }; -struct blkcipher_desc { - struct crypto_blkcipher *tfm; - void *info; - u32 flags; -}; - /** * DOC: Block Cipher Algorithm Definitions * @@ -240,32 +231,6 @@ struct ablkcipher_alg { unsigned int ivsize; }; -/** - * struct blkcipher_alg - synchronous block cipher definition - * @min_keysize: see struct ablkcipher_alg - * @max_keysize: see struct ablkcipher_alg - * @setkey: see struct ablkcipher_alg - * @encrypt: see struct ablkcipher_alg - * @decrypt: see struct ablkcipher_alg - * @ivsize: see struct ablkcipher_alg - * - * All fields except @ivsize are mandatory and must be filled. - */ -struct blkcipher_alg { - int (*setkey)(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - int (*encrypt)(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes); - int (*decrypt)(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes); - - unsigned int min_keysize; - unsigned int max_keysize; - unsigned int ivsize; -}; - /** * struct cipher_alg - single-block symmetric ciphers definition * @cia_min_keysize: Minimum key size supported by the transformation. This is @@ -451,7 +416,6 @@ struct crypto_istat_rng { #endif /* CONFIG_CRYPTO_STATS */ #define cra_ablkcipher cra_u.ablkcipher -#define cra_blkcipher cra_u.blkcipher #define cra_cipher cra_u.cipher #define cra_compress cra_u.compress @@ -499,9 +463,8 @@ struct crypto_istat_rng { * transformation algorithm. * @cra_type: Type of the cryptographic transformation. This is a pointer to * struct crypto_type, which implements callbacks common for all - * transformation types. There are multiple options: - * &crypto_blkcipher_type, &crypto_ablkcipher_type, - * &crypto_ahash_type, &crypto_rng_type. + * transformation types. There are multiple options, such as + * &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type. * This field might be empty. In that case, there are no common * callbacks. This is the case for: cipher, compress, shash. * @cra_u: Callbacks implementing the transformation. This is a union of @@ -522,8 +485,6 @@ struct crypto_istat_rng { * @cra_init. * @cra_u.ablkcipher: Union member which contains an asynchronous block cipher * definition. See @struct @ablkcipher_alg. - * @cra_u.blkcipher: Union member which contains a synchronous block cipher - * definition See @struct @blkcipher_alg. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. * @cra_u.compress: Union member which contains a (de)compression algorithm. @@ -566,7 +527,6 @@ struct crypto_alg { union { struct ablkcipher_alg ablkcipher; - struct blkcipher_alg blkcipher; struct cipher_alg cipher; struct compress_alg compress; } cra_u; @@ -727,16 +687,6 @@ struct ablkcipher_tfm { unsigned int reqsize; }; -struct blkcipher_tfm { - void *iv; - int (*setkey)(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen); - int (*encrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); - int (*decrypt)(struct blkcipher_desc *desc, struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes); -}; - struct cipher_tfm { int (*cit_setkey)(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); @@ -754,7 +704,6 @@ struct compress_tfm { }; #define crt_ablkcipher crt_u.ablkcipher -#define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_compress crt_u.compress @@ -764,7 +713,6 @@ struct crypto_tfm { union { struct ablkcipher_tfm ablkcipher; - struct blkcipher_tfm blkcipher; struct cipher_tfm cipher; struct compress_tfm compress; } crt_u; @@ -780,10 +728,6 @@ struct crypto_ablkcipher { struct crypto_tfm base; }; -struct crypto_blkcipher { - struct crypto_tfm base; -}; - struct crypto_cipher { struct crypto_tfm base; }; @@ -1232,341 +1176,6 @@ static inline void ablkcipher_request_set_crypt( req->info = iv; } -/** - * DOC: Synchronous Block Cipher API - * - * The synchronous block cipher API is used with the ciphers of type - * CRYPTO_ALG_TYPE_BLKCIPHER (listed as type "blkcipher" in /proc/crypto) - * - * Synchronous calls, have a context in the tfm. But since a single tfm can be - * used in multiple calls and in parallel, this info should not be changeable - * (unless a lock is used). This applies, for example, to the symmetric key. - * However, the IV is changeable, so there is an iv field in blkcipher_tfm - * structure for synchronous blkcipher api. So, its the only state info that can - * be kept for synchronous calls without using a big lock across a tfm. - * - * The block cipher API allows the use of a complete cipher, i.e. a cipher - * consisting of a template (a block chaining mode) and a single block cipher - * primitive (e.g. AES). - * - * The plaintext data buffer and the ciphertext data buffer are pointed to - * by using scatter/gather lists. The cipher operation is performed - * on all segments of the provided scatter/gather lists. - * - * The kernel crypto API supports a cipher operation "in-place" which means that - * the caller may provide the same scatter/gather list for the plaintext and - * cipher text. After the completion of the cipher operation, the plaintext - * data is replaced with the ciphertext data in case of an encryption and vice - * versa for a decryption. The caller must ensure that the scatter/gather lists - * for the output data point to sufficiently large buffers, i.e. multiples of - * the block size of the cipher. - */ - -static inline struct crypto_blkcipher *__crypto_blkcipher_cast( - struct crypto_tfm *tfm) -{ - return (struct crypto_blkcipher *)tfm; -} - -static inline struct crypto_blkcipher *crypto_blkcipher_cast( - struct crypto_tfm *tfm) -{ - BUG_ON(crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_BLKCIPHER); - return __crypto_blkcipher_cast(tfm); -} - -/** - * crypto_alloc_blkcipher() - allocate synchronous block cipher handle - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * blkcipher cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Allocate a cipher handle for a block cipher. The returned struct - * crypto_blkcipher is the cipher handle that is required for any subsequent - * API invocation for that block cipher. - * - * Return: allocated cipher handle in case of success; IS_ERR() is true in case - * of an error, PTR_ERR() returns the error code. - */ -static inline struct crypto_blkcipher *crypto_alloc_blkcipher( - const char *alg_name, u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_BLKCIPHER; - mask |= CRYPTO_ALG_TYPE_MASK; - - return __crypto_blkcipher_cast(crypto_alloc_base(alg_name, type, mask)); -} - -static inline struct crypto_tfm *crypto_blkcipher_tfm( - struct crypto_blkcipher *tfm) -{ - return &tfm->base; -} - -/** - * crypto_free_blkcipher() - zeroize and free the block cipher handle - * @tfm: cipher handle to be freed - */ -static inline void crypto_free_blkcipher(struct crypto_blkcipher *tfm) -{ - crypto_free_tfm(crypto_blkcipher_tfm(tfm)); -} - -/** - * crypto_has_blkcipher() - Search for the availability of a block cipher - * @alg_name: is the cra_name / name or cra_driver_name / driver name of the - * block cipher - * @type: specifies the type of the cipher - * @mask: specifies the mask for the cipher - * - * Return: true when the block cipher is known to the kernel crypto API; false - * otherwise - */ -static inline int crypto_has_blkcipher(const char *alg_name, u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_BLKCIPHER; - mask |= CRYPTO_ALG_TYPE_MASK; - - return crypto_has_alg(alg_name, type, mask); -} - -/** - * crypto_blkcipher_name() - return the name / cra_name from the cipher handle - * @tfm: cipher handle - * - * Return: The character string holding the name of the cipher - */ -static inline const char *crypto_blkcipher_name(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_name(crypto_blkcipher_tfm(tfm)); -} - -static inline struct blkcipher_tfm *crypto_blkcipher_crt( - struct crypto_blkcipher *tfm) -{ - return &crypto_blkcipher_tfm(tfm)->crt_blkcipher; -} - -static inline struct blkcipher_alg *crypto_blkcipher_alg( - struct crypto_blkcipher *tfm) -{ - return &crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher; -} - -/** - * crypto_blkcipher_ivsize() - obtain IV size - * @tfm: cipher handle - * - * The size of the IV for the block cipher referenced by the cipher handle is - * returned. This IV size may be zero if the cipher does not need an IV. - * - * Return: IV size in bytes - */ -static inline unsigned int crypto_blkcipher_ivsize(struct crypto_blkcipher *tfm) -{ - return crypto_blkcipher_alg(tfm)->ivsize; -} - -/** - * crypto_blkcipher_blocksize() - obtain block size of cipher - * @tfm: cipher handle - * - * The block size for the block cipher referenced with the cipher handle is - * returned. The caller may use that information to allocate appropriate - * memory for the data returned by the encryption or decryption operation. - * - * Return: block size of cipher - */ -static inline unsigned int crypto_blkcipher_blocksize( - struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_blkcipher_tfm(tfm)); -} - -static inline unsigned int crypto_blkcipher_alignmask( - struct crypto_blkcipher *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_blkcipher_tfm(tfm)); -} - -static inline u32 crypto_blkcipher_get_flags(struct crypto_blkcipher *tfm) -{ - return crypto_tfm_get_flags(crypto_blkcipher_tfm(tfm)); -} - -static inline void crypto_blkcipher_set_flags(struct crypto_blkcipher *tfm, - u32 flags) -{ - crypto_tfm_set_flags(crypto_blkcipher_tfm(tfm), flags); -} - -static inline void crypto_blkcipher_clear_flags(struct crypto_blkcipher *tfm, - u32 flags) -{ - crypto_tfm_clear_flags(crypto_blkcipher_tfm(tfm), flags); -} - -/** - * crypto_blkcipher_setkey() - set key for cipher - * @tfm: cipher handle - * @key: buffer holding the key - * @keylen: length of the key in bytes - * - * The caller provided key is set for the block cipher referenced by the cipher - * handle. - * - * Note, the key length determines the cipher type. Many block ciphers implement - * different cipher modes depending on the key size, such as AES-128 vs AES-192 - * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 - * is performed. - * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_setkey(struct crypto_blkcipher *tfm, - const u8 *key, unsigned int keylen) -{ - return crypto_blkcipher_crt(tfm)->setkey(crypto_blkcipher_tfm(tfm), - key, keylen); -} - -/** - * crypto_blkcipher_encrypt() - encrypt plaintext - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * ciphertext - * @src: scatter/gather list that holds the plaintext - * @nbytes: number of bytes of the plaintext to encrypt. - * - * Encrypt plaintext data using the IV set by the caller with a preceding - * call of crypto_blkcipher_set_iv. - * - * The blkcipher_desc data structure must be filled by the caller and can - * reside on the stack. The caller must fill desc as follows: desc.tfm is filled - * with the block cipher handle; desc.flags is filled with either - * CRYPTO_TFM_REQ_MAY_SLEEP or 0. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - desc->info = crypto_blkcipher_crt(desc->tfm)->iv; - return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_encrypt_iv() - encrypt plaintext with dedicated IV - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * ciphertext - * @src: scatter/gather list that holds the plaintext - * @nbytes: number of bytes of the plaintext to encrypt. - * - * Encrypt plaintext data with the use of an IV that is solely used for this - * cipher operation. Any previously set IV is not used. - * - * The blkcipher_desc data structure must be filled by the caller and can - * reside on the stack. The caller must fill desc as follows: desc.tfm is filled - * with the block cipher handle; desc.info is filled with the IV to be used for - * the current operation; desc.flags is filled with either - * CRYPTO_TFM_REQ_MAY_SLEEP or 0. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_encrypt_iv(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_decrypt() - decrypt ciphertext - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * plaintext - * @src: scatter/gather list that holds the ciphertext - * @nbytes: number of bytes of the ciphertext to decrypt. - * - * Decrypt ciphertext data using the IV set by the caller with a preceding - * call of crypto_blkcipher_set_iv. - * - * The blkcipher_desc data structure must be filled by the caller as documented - * for the crypto_blkcipher_encrypt call above. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - * - */ -static inline int crypto_blkcipher_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - desc->info = crypto_blkcipher_crt(desc->tfm)->iv; - return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_decrypt_iv() - decrypt ciphertext with dedicated IV - * @desc: reference to the block cipher handle with meta data - * @dst: scatter/gather list that is filled by the cipher operation with the - * plaintext - * @src: scatter/gather list that holds the ciphertext - * @nbytes: number of bytes of the ciphertext to decrypt. - * - * Decrypt ciphertext data with the use of an IV that is solely used for this - * cipher operation. Any previously set IV is not used. - * - * The blkcipher_desc data structure must be filled by the caller as documented - * for the crypto_blkcipher_encrypt_iv call above. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_blkcipher_decrypt_iv(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes); -} - -/** - * crypto_blkcipher_set_iv() - set IV for cipher - * @tfm: cipher handle - * @src: buffer holding the IV - * @len: length of the IV in bytes - * - * The caller provided IV is set for the block cipher referenced by the cipher - * handle. - */ -static inline void crypto_blkcipher_set_iv(struct crypto_blkcipher *tfm, - const u8 *src, unsigned int len) -{ - memcpy(crypto_blkcipher_crt(tfm)->iv, src, len); -} - -/** - * crypto_blkcipher_get_iv() - obtain IV from cipher - * @tfm: cipher handle - * @dst: buffer filled with the IV - * @len: length of the buffer dst - * - * The caller can obtain the IV set for the block cipher referenced by the - * cipher handle and store it into the user-provided buffer. If the buffer - * has an insufficient space, the IV is truncated to fit the buffer. - */ -static inline void crypto_blkcipher_get_iv(struct crypto_blkcipher *tfm, - u8 *dst, unsigned int len) -{ - memcpy(dst, crypto_blkcipher_crt(tfm)->iv, len); -} - /** * DOC: Single Block Cipher API * -- cgit v1.2.3 From 8b5369ea580964dbc982781bfb9fb93459fc5e8d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 14 Oct 2019 20:31:03 +0200 Subject: dma/direct: turn ARCH_ZONE_DMA_BITS into a variable Some architectures, notably ARM, are interested in tweaking this depending on their runtime DMA addressing limitations. Acked-by: Christoph Hellwig Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Catalin Marinas --- include/linux/dma-direct.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index adf993a3bd58..d03af3605460 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -5,6 +5,8 @@ #include #include +extern unsigned int zone_dma_bits; + #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA #include #else -- cgit v1.2.3 From 73727f4dafa2df107e85753c5ab703a1f344e1f1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:10 +0100 Subject: livepatch: Basic API to track system state changes This is another step how to help maintaining more livepatches. One big help was the atomic replace and cumulative livepatches. These livepatches replace the already installed ones. Therefore it should be enough when each cumulative livepatch is consistent. The problems might come with shadow variables and callbacks. They might change the system behavior or state so that it is no longer safe to go back and use an older livepatch or the original kernel code. Also, a new livepatch must be able to detect changes which were made by the already installed livepatches. This is where the livepatch system state tracking gets useful. It allows to: - find whether a system state has already been modified by previous livepatches - store data needed to manipulate and restore the system state The information about the manipulated system states is stored in an array of struct klp_state. It can be searched by two new functions klp_get_state() and klp_get_prev_state(). The dependencies are going to be solved by a version field added later. The only important information is that it will be allowed to modify the same state by more non-cumulative livepatches. It is similar to allowing to modify the same function several times. The livepatch author is responsible for preventing incompatible changes. Link: http://lkml.kernel.org/r/20191030154313.13263-3-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- include/linux/livepatch.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 273400814020..726947338fd5 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -130,10 +130,21 @@ struct klp_object { bool patched; }; +/** + * struct klp_state - state of the system modified by the livepatch + * @id: system state identifier (non-zero) + * @data: custom data + */ +struct klp_state { + unsigned long id; + void *data; +}; + /** * struct klp_patch - patch structure for live patching * @mod: reference to the live patch module * @objs: object entries for kernel objects to be patched + * @states: system states that can get modified * @replace: replace all actively used patches * @list: list node for global list of actively used patches * @kobj: kobject for sysfs resources @@ -147,6 +158,7 @@ struct klp_patch { /* external */ struct module *mod; struct klp_object *objs; + struct klp_state *states; bool replace; /* internal */ @@ -217,6 +229,9 @@ void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor); void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor); +struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id); +struct klp_state *klp_get_prev_state(unsigned long id); + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } -- cgit v1.2.3 From 92c9abf5e57500ea7dc59a55273aa7850b631bda Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 30 Oct 2019 16:43:11 +0100 Subject: livepatch: Allow to distinguish different version of system state changes The atomic replace runs pre/post (un)install callbacks only from the new livepatch. There are several reasons for this: + Simplicity: clear ordering of operations, no interactions between old and new callbacks. + Reliability: only new livepatch knows what changes can already be made by older livepatches and how to take over the state. + Testing: the atomic replace can be properly tested only when a newer livepatch is available. It might be too late to fix unwanted effect of callbacks from older livepatches. It might happen that an older change is not enough and the same system state has to be modified another way. Different changes need to get distinguished by a version number added to struct klp_state. The version can also be used to prevent loading incompatible livepatches. The check is done when the livepatch is enabled. The rules are: + Any completely new system state modification is allowed. + System state modifications with the same or higher version are allowed for already modified system states. + Cumulative livepatches must handle all system state modifications from already installed livepatches. + Non-cumulative livepatches are allowed to touch already modified system states. Link: http://lkml.kernel.org/r/20191030154313.13263-4-pmladek@suse.com To: Jiri Kosina Cc: Kamalesh Babulal Cc: Nicolai Stange Cc: live-patching@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Miroslav Benes Acked-by: Joe Lawrence Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- include/linux/livepatch.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 726947338fd5..e894e74905f3 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -133,10 +133,12 @@ struct klp_object { /** * struct klp_state - state of the system modified by the livepatch * @id: system state identifier (non-zero) + * @version: version of the change * @data: custom data */ struct klp_state { unsigned long id; + unsigned int version; void *data; }; -- cgit v1.2.3 From 62755e35dfb2b113c52b81cd96d01c20971c8e02 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Oct 2019 21:49:21 -0600 Subject: io_uring: support for generic async request cancel This adds support for IORING_OP_ASYNC_CANCEL, which will attempt to cancel requests that have been punted to async context and are now in-flight. This works for regular read/write requests to files, as long as they haven't been started yet. For socket based IO (or things like accept4(2)), we can cancel work that is already running as well. To cancel a request, the sqe must have ->addr set to the user_data of the request it wishes to cancel. If the request is cancelled successfully, the original request is completed with -ECANCELED and the cancel request is completed with a result of 0. If the request was already running, the original may or may not complete in error. The cancel request will complete with -EALREADY for that case. And finally, if the request to cancel wasn't found, the cancel request is completed with -ENOENT. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f82d90e617a6..6877cf8894db 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -33,6 +33,7 @@ struct io_uring_sqe { __u32 msg_flags; __u32 timeout_flags; __u32 accept_flags; + __u32 cancel_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -70,6 +71,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 +#define IORING_OP_ASYNC_CANCEL 14 /* * sqe->fsync_flags -- cgit v1.2.3 From 626fb735a43ddcb7b2c58c27cb03b098acc03339 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 30 Oct 2019 00:59:30 +0800 Subject: blk-mq: Make blk_mq_run_hw_queue() return void Since commit 97889f9ac24f ("blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()"), the return value of blk_mq_run_hw_queue() is never checked, so make it return void, which very marginally simplifies the code. Reviewed-by: Bob Liu Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4919d22e1aff..dc03e059fdff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -502,7 +502,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); -- cgit v1.2.3 From 25937580a5065d6fbd92d9c8ebd47145ad80052e Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Oct 2019 10:59:44 +0100 Subject: ata: define AC_ERR_OK Since we will return enum ata_completion_errors from qc_prep in the next patch, let's define AC_ERR_OK to mark the OK status. Signed-off-by: Jiri Slaby Cc: Jens Axboe Cc: linux-ide@vger.kernel.org Signed-off-by: Jens Axboe --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 207e7ee764ce..b63ce4ebcd66 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -484,6 +484,7 @@ enum hsm_task_states { }; enum ata_completion_errors { + AC_ERR_OK = 0, /* no error */ AC_ERR_DEV = (1 << 0), /* device reported error */ AC_ERR_HSM = (1 << 1), /* host state machine violation */ AC_ERR_TIMEOUT = (1 << 2), /* timeout */ -- cgit v1.2.3 From 95364f36701e62dd50eee91e1303187fd1a9f567 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Oct 2019 10:59:45 +0100 Subject: ata: make qc_prep return ata_completion_errors In case a driver wants to return an error from qc_prep, return enum ata_completion_errors. sata_mv is one of those drivers -- see the next patch. Other drivers return the newly defined AC_ERR_OK. [v2] use enum ata_completion_errors and AC_ERR_OK. Signed-off-by: Jiri Slaby Cc: Jens Axboe Cc: linux-ide@vger.kernel.org Signed-off-by: Jens Axboe --- include/linux/libata.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index b63ce4ebcd66..d3bbfddf616a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -892,9 +892,9 @@ struct ata_port_operations { /* * Command execution */ - int (*qc_defer)(struct ata_queued_cmd *qc); - int (*check_atapi_dma)(struct ata_queued_cmd *qc); - void (*qc_prep)(struct ata_queued_cmd *qc); + int (*qc_defer)(struct ata_queued_cmd *qc); + int (*check_atapi_dma)(struct ata_queued_cmd *qc); + enum ata_completion_errors (*qc_prep)(struct ata_queued_cmd *qc); unsigned int (*qc_issue)(struct ata_queued_cmd *qc); bool (*qc_fill_rtf)(struct ata_queued_cmd *qc); @@ -1162,7 +1162,7 @@ extern int ata_xfer_mode2shift(unsigned long xfer_mode); extern const char *ata_mode_string(unsigned long xfer_mask); extern unsigned long ata_id_xfermask(const u16 *id); extern int ata_std_qc_defer(struct ata_queued_cmd *qc); -extern void ata_noop_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc); extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem); extern unsigned int ata_dev_classify(const struct ata_taskfile *tf); @@ -1894,9 +1894,9 @@ extern const struct ata_port_operations ata_bmdma_port_ops; .sg_tablesize = LIBATA_MAX_PRD, \ .dma_boundary = ATA_DMA_BOUNDARY -extern void ata_bmdma_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc); -extern void ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc); extern irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance); -- cgit v1.2.3 From be0c5677970d4f21dc701136a178437aad9983b2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 1 Nov 2019 14:46:37 +0200 Subject: net: bridge: fdb: br_fdb_update can take flags directly If we modify br_fdb_update() to take flags directly we can get rid of one test and one atomic bitop in the learning path. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/trace/events/bridge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h index 8ea966448b58..6b200059c2c5 100644 --- a/include/trace/events/bridge.h +++ b/include/trace/events/bridge.h @@ -95,16 +95,16 @@ TRACE_EVENT(fdb_delete, TRACE_EVENT(br_fdb_update, TP_PROTO(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid, bool added_by_user), + const unsigned char *addr, u16 vid, unsigned long flags), - TP_ARGS(br, source, addr, vid, added_by_user), + TP_ARGS(br, source, addr, vid, flags), TP_STRUCT__entry( __string(br_dev, br->dev->name) __string(dev, source->dev->name) __array(unsigned char, addr, ETH_ALEN) __field(u16, vid) - __field(bool, added_by_user) + __field(unsigned long, flags) ), TP_fast_assign( @@ -112,14 +112,14 @@ TRACE_EVENT(br_fdb_update, __assign_str(dev, source->dev->name); memcpy(__entry->addr, addr, ETH_ALEN); __entry->vid = vid; - __entry->added_by_user = added_by_user; + __entry->flags = flags; ), - TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u added_by_user %d", + TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u flags 0x%lx", __get_str(br_dev), __get_str(dev), __entry->addr[0], __entry->addr[1], __entry->addr[2], __entry->addr[3], __entry->addr[4], __entry->addr[5], __entry->vid, - __entry->added_by_user) + __entry->flags) ); -- cgit v1.2.3 From 0069fc6b1cf28de3a3890ed7c87a5b8ab79ca528 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Nov 2019 12:44:40 -0600 Subject: io_uring: remove io_uring_add_to_prev() trace event This internal logic was killed with the conversion to io-wq, so we no longer have a need for this particular trace. Kill it. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index b85255121b98..8f21d8bf20fd 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -227,35 +227,6 @@ TRACE_EVENT(io_uring_link, __entry->ctx, __entry->req, __entry->target_req) ); -/** - * io_uring_add_to_prev - called after a request was added into a previously - * submitted work - * - * @req: pointer to a request, added to a previous - * @ret: whether or not it was completed successfully - * - * Allows to track merged work, to figure out how often requests are piggy - * backed into other ones, changing the execution flow. - */ -TRACE_EVENT(io_uring_add_to_prev, - - TP_PROTO(void *req, bool ret), - - TP_ARGS(req, ret), - - TP_STRUCT__entry ( - __field( void *, req ) - __field( bool, ret ) - ), - - TP_fast_assign( - __entry->req = req; - __entry->ret = ret; - ), - - TP_printk("request %p, ret %d", __entry->req, __entry->ret) -); - /** * io_uring_cqring_wait - called before start waiting for an available CQE * -- cgit v1.2.3 From e53a9d26cf80565cfb7172fc52a0dfac73613a0f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 28 Oct 2019 23:35:30 +0000 Subject: IB/mlx5: Introduce and use mlx5_core_is_vf() Instead of deciding a given device is virtual function or not based on a device is PF or not, use already defined MLX5_COREDEV_VF by introducing an helper API mlx5_core_is_vf(). This enables to clearly identify PF, VF and non virtual functions. Signed-off-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3e80f03a387f..7b4801e96feb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1121,6 +1121,11 @@ static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_PF; } +static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) +{ + return dev->coredev_type == MLX5_COREDEV_VF; +} + static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; -- cgit v1.2.3 From d817991cc7486ab83f6c7188b0bc80eebee872f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 1 Nov 2019 12:03:46 +0100 Subject: xsk: Restructure/inline XSKMAP lookup/redirect/flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit the XSKMAP entry lookup function used by the XDP redirect code is moved from the xskmap.c file to the xdp_sock.h header, so the lookup can be inlined from, e.g., the bpf_xdp_redirect_map() function. Further the __xsk_map_redirect() and __xsk_map_flush() is moved to the xsk.c, which lets the compiler inline the xsk_rcv() and xsk_flush() functions. Finally, all the XDP socket functions were moved from linux/bpf.h to net/xdp_sock.h, where most of the XDP sockets functions are anyway. This yields a ~2% performance boost for the xdpsock "rx_drop" scenario. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191101110346.15004-4-bjorn.topel@gmail.com --- include/linux/bpf.h | 25 ------------------------- include/net/xdp_sock.h | 51 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80158cff44bd..7c7f518811a6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1009,31 +1009,6 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, } #endif -#if defined(CONFIG_XDP_SOCKETS) -struct xdp_sock; -struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); -int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs); -void __xsk_map_flush(struct bpf_map *map); -#else -struct xdp_sock; -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, - struct xdp_sock *xs) -{ - return -EOPNOTSUPP; -} - -static inline void __xsk_map_flush(struct bpf_map *map) -{ -} -#endif - #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9398ce7960f..e3780e4b74e1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -69,7 +69,14 @@ struct xdp_umem { /* Nodes are linked in the struct xdp_sock map_list field, and used to * track which maps a certain socket reside in. */ -struct xsk_map; + +struct xsk_map { + struct bpf_map map; + struct list_head __percpu *flush_list; + spinlock_t lock; /* Synchronize map updates */ + struct xdp_sock *xsk_map[]; +}; + struct xsk_map_node { struct list_head node; struct xsk_map *map; @@ -109,8 +116,6 @@ struct xdp_sock { struct xdp_buff; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); -void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); @@ -134,6 +139,22 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry); int xsk_map_inc(struct xsk_map *map); void xsk_map_put(struct xsk_map *map); +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs); +void __xsk_map_flush(struct bpf_map *map); + +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} static inline u64 xsk_umem_extract_addr(u64 addr) { @@ -224,15 +245,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return -ENOTSUPP; } -static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) -{ - return -ENOTSUPP; -} - -static inline void xsk_flush(struct xdp_sock *xs) -{ -} - static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return false; @@ -357,6 +369,21 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, return 0; } +static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + return -EOPNOTSUPP; +} + +static inline void __xsk_map_flush(struct bpf_map *map) +{ +} + +static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ -- cgit v1.2.3 From 1d1585ca0f48fe7ed95c3571f3e4a82b2b5045dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:56 +0100 Subject: uaccess: Add non-pagefault user-space write function Commit 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions") missed to add probe write function, therefore factor out a probe_write_common() helper with most logic of probe_kernel_write() except setting KERNEL_DS, and add a new probe_user_write() helper so it can be used from BPF side. Again, on some archs, the user address space and kernel address space can co-exist and be overlapping, so in such case, setting KERNEL_DS would mean that the given address is treated as being in kernel address space. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: Masami Hiramatsu Link: https://lore.kernel.org/bpf/9df2542e68141bfa3addde631441ee45503856a8.1572649915.git.daniel@iogearbox.net --- include/linux/uaccess.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index d4ee6e942562..38555435a64a 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -337,6 +337,18 @@ extern long __probe_user_read(void *dst, const void __user *src, size_t size); extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); +/* + * probe_user_write(): safely attempt to write to a location in user space + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +extern long notrace probe_user_write(void __user *dst, const void *src, size_t size); +extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); + extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count); -- cgit v1.2.3 From 75a1a607bb7e6d918be3aca11ec2214a275392f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:57 +0100 Subject: uaccess: Add strict non-pagefault kernel-space read function Add two new probe_kernel_read_strict() and strncpy_from_unsafe_strict() helpers which by default alias to the __probe_kernel_read() and the __strncpy_from_unsafe(), respectively, but can be overridden by archs which have non-overlapping address ranges for kernel space and user space in order to bail out with -EFAULT when attempting to probe user memory including non-canonical user access addresses [0]: 4-level page tables: user-space mem: 0x0000000000000000 - 0x00007fffffffffff non-canonical: 0x0000800000000000 - 0xffff7fffffffffff 5-level page tables: user-space mem: 0x0000000000000000 - 0x00ffffffffffffff non-canonical: 0x0100000000000000 - 0xfeffffffffffffff The idea is that these helpers are complementary to the probe_user_read() and strncpy_from_unsafe_user() which probe user-only memory. Both added helpers here do the same, but for kernel-only addresses. Both set of helpers are going to be used for BPF tracing. They also explicitly avoid throwing the splat for non-canonical user addresses from 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). For compat, the current probe_kernel_read() and strncpy_from_unsafe() are left as-is. [0] Documentation/x86/x86_64/mm.txt Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: x86@kernel.org Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net --- include/linux/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 38555435a64a..67f016010aad 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src, * happens, handle that and return -EFAULT. */ extern long probe_kernel_read(void *dst, const void *src, size_t size); +extern long probe_kernel_read_strict(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); /* @@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, + long count); +extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count); extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 122 +++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 51c3ff62cac635ae9d75f875ce5b7bdafc97abd5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 3 Nov 2019 06:52:50 -0700 Subject: io_uring: add completion trace event We currently don't have a completion event trace, add one of those. And to better be able to match up submissions and completions, add user_data to the submission trace as well. Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 54 ++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 8f21d8bf20fd..72a4d0174b02 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -284,10 +284,43 @@ TRACE_EVENT(io_uring_fail_link, TP_printk("request %p, link %p", __entry->req, __entry->link) ); +/** + * io_uring_complete - called when completing an SQE + * + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request + * @res: result of the request + * + */ +TRACE_EVENT(io_uring_complete, + + TP_PROTO(void *ctx, u64 user_data, long res), + + TP_ARGS(ctx, user_data, res), + + TP_STRUCT__entry ( + __field( void *, ctx ) + __field( u64, user_data ) + __field( long, res ) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->user_data = user_data; + __entry->res = res; + ), + + TP_printk("ring %p, user_data 0x%llx, result %ld", + __entry->ctx, (unsigned long long)__entry->user_data, + __entry->res) +); + + /** * io_uring_submit_sqe - called before submitting one SQE * - * @ctx: pointer to a ring context structure + * @ctx: pointer to a ring context structure + * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE * @@ -296,24 +329,27 @@ TRACE_EVENT(io_uring_fail_link, */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, bool force_nonblock, bool sq_thread), + TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread), - TP_ARGS(ctx, force_nonblock, sq_thread), + TP_ARGS(ctx, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( - __field( void *, ctx ) + __field( void *, ctx ) + __field( u64, user_data ) __field( bool, force_nonblock ) - __field( bool, sq_thread ) + __field( bool, sq_thread ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = ctx; + __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; - __entry->sq_thread = sq_thread; + __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, non block %d, sq_thread %d", - __entry->ctx, __entry->force_nonblock, __entry->sq_thread) + TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d", + __entry->ctx, (unsigned long long) __entry->user_data, + __entry->force_nonblock, __entry->sq_thread) ); #endif /* _TRACE_IO_URING_H */ -- cgit v1.2.3 From 51f421c85c880dcb37df11e672b384eaa4444328 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:21 -0600 Subject: block: sed-opal: Add support to read/write opal tables generically This feature gives the user RW access to any opal table with admin1 authority. The flags described in the new structure determines if the user wants to read/write the data. Flags are checked for valid values in order to allow future features to be added to the ioctl. The user can provide the desired table's UID. Also, the ioctl provides a size and offset field and internally will loop data accesses to return the full data block. Read overrun is prevented by the initiator's sec_send_recv() backend. The ioctl provides a private field with the intention to accommodate any future expansions to the ioctl. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 53c28d750a45..1ac0d712a9c3 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: + case IOC_OPAL_GENERIC_TABLE_RW: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index c6d035fa1b6c..6f5af1a84213 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -113,6 +113,25 @@ struct opal_shadow_mbr { __u64 size; }; +/* Opal table operations */ +enum opal_table_ops { + OPAL_READ_TABLE, + OPAL_WRITE_TABLE, +}; + +#define OPAL_UID_LENGTH 8 +struct opal_read_write_table { + struct opal_key key; + const __u64 data; + const __u8 table_uid[OPAL_UID_LENGTH]; + __u64 offset; + __u64 size; +#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE) +#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE) + __u64 flags; + __u64 priv; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -128,5 +147,6 @@ struct opal_shadow_mbr { #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) +#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From f16583614222d015968541f2e50447c67c277f74 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Sep 2019 14:51:34 -0700 Subject: nvme-fc: Sync nvme-fc header to FC-NVME-2 Sync the header to FC-NVME-2 r1.06 (T11-2019-00210-v001). Includes some minor mods where pre-release field names changed by the time the spec was released. Signed-off-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 182 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 137 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 067c9fea64fe..e8c30b39bb27 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -4,33 +4,60 @@ */ /* - * This file contains definitions relative to FC-NVME r1.14 (16-020vB). - * The fcnvme_lsdesc_cr_assoc_cmd struct reflects expected r1.16 content. + * This file contains definitions relative to FC-NVME-2 r1.06 + * (T11-2019-00210-v001). */ #ifndef _NVME_FC_H #define _NVME_FC_H 1 +#include -#define NVME_CMD_SCSI_ID 0xFD +#define NVME_CMD_FORMAT_ID 0xFD #define NVME_CMD_FC_ID FC_TYPE_NVME /* FC-NVME Cmd IU Flags */ -#define FCNVME_CMD_FLAGS_DIRMASK 0x03 -#define FCNVME_CMD_FLAGS_WRITE 0x01 -#define FCNVME_CMD_FLAGS_READ 0x02 +enum { + FCNVME_CMD_FLAGS_DIRMASK = 0x03, + FCNVME_CMD_FLAGS_WRITE = (1 << 0), + FCNVME_CMD_FLAGS_READ = (1 << 1), + + FCNVME_CMD_FLAGS_PICWP = (1 << 2), +}; + +enum { + FCNVME_CMD_CAT_MASK = 0x0F, + FCNVME_CMD_CAT_ADMINQ = 0x01, + FCNVME_CMD_CAT_CSSMASK = 0x07, + FCNVME_CMD_CAT_CSSFLAG = 0x08, +}; + +static inline __u8 fccmnd_set_cat_admin(__u8 rsv_cat) +{ + return (rsv_cat & ~FCNVME_CMD_CAT_MASK) | FCNVME_CMD_CAT_ADMINQ; +} + +static inline __u8 fccmnd_set_cat_css(__u8 rsv_cat, __u8 css) +{ + return (rsv_cat & ~FCNVME_CMD_CAT_MASK) | FCNVME_CMD_CAT_CSSFLAG | + (css & FCNVME_CMD_CAT_CSSMASK); +} struct nvme_fc_cmd_iu { - __u8 scsi_id; + __u8 format_id; __u8 fc_id; __be16 iu_len; - __u8 rsvd4[3]; + __u8 rsvd4[2]; + __u8 rsv_cat; __u8 flags; __be64 connection_id; __be32 csn; __be32 data_len; struct nvme_command sqe; - __be32 rsvd88[2]; + __u8 dps; + __u8 lbads; + __be16 ms; + __be32 rsvd92; }; #define NVME_FC_SIZEOF_ZEROS_RSP 12 @@ -38,11 +65,12 @@ struct nvme_fc_cmd_iu { enum { FCNVME_SC_SUCCESS = 0, FCNVME_SC_INVALID_FIELD = 1, - FCNVME_SC_INVALID_CONNID = 2, + /* reserved 2 */ + FCNVME_SC_ILL_CONN_PARAMS = 3, }; struct nvme_fc_ersp_iu { - __u8 status_code; + __u8 ersp_result; __u8 rsvd1; __be16 iu_len; __be32 rsn; @@ -53,14 +81,44 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME Link Services */ +#define FCNVME_NVME_SR_OPCODE 0x01 + +struct nvme_fc_nvme_sr_iu { + __u8 fc_id; + __u8 opcode; + __u8 rsvd2; + __u8 retry_rctl; + __be32 rsvd4; +}; + + +enum { + FCNVME_SRSTAT_ACC = 0x0, + FCNVME_SRSTAT_INV_FCID = 0x1, + /* reserved 0x2 */ + FCNVME_SRSTAT_LOGICAL_ERR = 0x3, + FCNVME_SRSTAT_INV_QUALIF = 0x4, + FCNVME_SRSTAT_UNABL2PERFORM = 0x9, +}; + +struct nvme_fc_nvme_sr_rsp_iu { + __u8 fc_id; + __u8 opcode; + __u8 rsvd2; + __u8 status; + __be32 rsvd4; +}; + + +/* FC-NVME Link Services - LS cmd values (w0 bits 31:24) */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, FCNVME_LS_ACC = 2, - FCNVME_LS_CREATE_ASSOCIATION = 3, - FCNVME_LS_CREATE_CONNECTION = 4, - FCNVME_LS_DISCONNECT = 5, + FCNVME_LS_CREATE_ASSOCIATION = 3, /* Create Association */ + FCNVME_LS_CREATE_CONNECTION = 4, /* Create I/O Connection */ + FCNVME_LS_DISCONNECT_ASSOC = 5, /* Disconnect Association */ + FCNVME_LS_DISCONNECT_CONN = 6, /* Disconnect Connection */ }; /* FC-NVME Link Service Descriptors */ @@ -117,14 +175,17 @@ enum fcnvme_ls_rjt_reason { FCNVME_RJT_RC_UNSUP = 0x0b, /* command not supported */ - FCNVME_RJT_RC_INPROG = 0x0e, - /* command already in progress */ - FCNVME_RJT_RC_INV_ASSOC = 0x40, - /* Invalid Association ID*/ + /* Invalid Association ID */ FCNVME_RJT_RC_INV_CONN = 0x41, - /* Invalid Connection ID*/ + /* Invalid Connection ID */ + + FCNVME_RJT_RC_INV_PARAM = 0x42, + /* Invalid Parameters */ + + FCNVME_RJT_RC_INSUF_RES = 0x43, + /* Insufficient Resources */ FCNVME_RJT_RC_VENDOR = 0xff, /* vendor specific error */ @@ -138,14 +199,32 @@ enum fcnvme_ls_rjt_explan { FCNVME_RJT_EXP_OXID_RXID = 0x17, /* invalid OX_ID-RX_ID combination */ - FCNVME_RJT_EXP_INSUF_RES = 0x29, - /* insufficient resources */ - FCNVME_RJT_EXP_UNAB_DATA = 0x2a, /* unable to supply requested data */ FCNVME_RJT_EXP_INV_LEN = 0x2d, /* Invalid payload length */ + + FCNVME_RJT_EXP_INV_ERSP_RAT = 0x40, + /* Invalid NVMe_ERSP Ratio */ + + FCNVME_RJT_EXP_INV_CTLR_ID = 0x41, + /* Invalid Controller ID */ + + FCNVME_RJT_EXP_INV_QUEUE_ID = 0x42, + /* Invalid Queue ID */ + + FCNVME_RJT_EXP_INV_SQSIZE = 0x43, + /* Invalid Submission Queue Size */ + + FCNVME_RJT_EXP_INV_HOSTID = 0x44, + /* Invalid HOST ID */ + + FCNVME_RJT_EXP_INV_HOSTNQN = 0x45, + /* Invalid HOSTNQN */ + + FCNVME_RJT_EXP_INV_SUBNQN = 0x46, + /* Invalid SUBNQN */ }; /* FCNVME_LSDESC_RJT */ @@ -209,21 +288,11 @@ struct fcnvme_lsdesc_cr_conn_cmd { __be32 rsvd52; }; -/* Disconnect Scope Values */ -enum { - FCNVME_DISCONN_ASSOCIATION = 0, - FCNVME_DISCONN_CONNECTION = 1, -}; - /* FCNVME_LSDESC_DISCONN_CMD */ struct fcnvme_lsdesc_disconn_cmd { __be32 desc_tag; /* FCNVME_LSDESC_xxx */ __be32 desc_len; - u8 rsvd8[3]; - /* note: scope is really a 1 bit field */ - u8 scope; /* FCNVME_DISCONN_xxx */ - __be32 rsvd12; - __be64 id; + __be32 rsvd8[4]; }; /* FCNVME_LSDESC_CONN_ID */ @@ -242,9 +311,14 @@ struct fcnvme_lsdesc_assoc_id { /* r_ctl values */ enum { - FCNVME_RS_RCTL_DATA = 1, - FCNVME_RS_RCTL_XFER_RDY = 5, - FCNVME_RS_RCTL_RSP = 8, + FCNVME_RS_RCTL_CMND = 0x6, + FCNVME_RS_RCTL_DATA = 0x1, + FCNVME_RS_RCTL_CONF = 0x3, + FCNVME_RS_RCTL_SR = 0x9, + FCNVME_RS_RCTL_XFER_RDY = 0x5, + FCNVME_RS_RCTL_RSP = 0x7, + FCNVME_RS_RCTL_ERSP = 0x8, + FCNVME_RS_RCTL_SR_RSP = 0xA, }; @@ -264,7 +338,10 @@ struct fcnvme_ls_acc_hdr { struct fcnvme_ls_rqst_w0 w0; __be32 desc_list_len; struct fcnvme_lsdesc_rqst rqst; - /* Followed by cmd-specific ACC descriptors, see next definitions */ + /* + * Followed by cmd-specific ACCEPT descriptors, see xxx_acc + * definitions below + */ }; /* FCNVME_LS_CREATE_ASSOCIATION */ @@ -302,25 +379,39 @@ struct fcnvme_ls_cr_conn_acc { struct fcnvme_lsdesc_conn_id connectid; }; -/* FCNVME_LS_DISCONNECT */ -struct fcnvme_ls_disconnect_rqst { +/* FCNVME_LS_DISCONNECT_ASSOC */ +struct fcnvme_ls_disconnect_assoc_rqst { struct fcnvme_ls_rqst_w0 w0; __be32 desc_list_len; struct fcnvme_lsdesc_assoc_id associd; struct fcnvme_lsdesc_disconn_cmd discon_cmd; }; -struct fcnvme_ls_disconnect_acc { +struct fcnvme_ls_disconnect_assoc_acc { + struct fcnvme_ls_acc_hdr hdr; +}; + + +/* FCNVME_LS_DISCONNECT_CONN */ +struct fcnvme_ls_disconnect_conn_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_disconn_cmd connectid; +}; + +struct fcnvme_ls_disconnect_conn_acc { struct fcnvme_ls_acc_hdr hdr; }; /* - * Yet to be defined in FC-NVME: + * Default R_A_TOV is pulled in from fc_fs.h but needs conversion + * from ms to seconds for our use. */ -#define NVME_FC_CONNECT_TIMEOUT_SEC 2 /* 2 seconds */ -#define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ -#define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ +#define FC_TWO_TIMES_R_A_TOV (2 * (FC_DEF_R_A_TOV / 1000)) +#define NVME_FC_LS_TIMEOUT_SEC FC_TWO_TIMES_R_A_TOV +#define NVME_FC_TGTOP_TIMEOUT_SEC FC_TWO_TIMES_R_A_TOV /* * TRADDR string must be of form "nn-<16hexdigits>:pn-<16hexdigits>" @@ -328,6 +419,7 @@ struct fcnvme_ls_disconnect_acc { * infront of the <16hexdigits>. Without is considered the "min" string * and with is considered the "max" string. The hexdigits may be upper * or lower case. + * Note: FC-NVME-2 standard requires a "0x" prefix. */ #define NVME_FC_TRADDR_NNLEN 3 /* "?n-" */ #define NVME_FC_TRADDR_OXNNLEN 5 /* "?n-0x" */ -- cgit v1.2.3 From 2dc3947b53f573e8a75ea9cbec5588df88ca502e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Oct 2019 19:57:35 +0300 Subject: nvme: introduce "Command Aborted By host" status code Fix the status code of canceled requests initiated by the host according to TP4028 (Status Code 0x371): "Command Aborted By host: The command was aborted as a result of host action (e.g., the host disconnected the Fabric connection)." Also in a multipath environment, unless otherwise specified, errors of this type (path related) should be retried using a different path, if one is available. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f61d6906e59d..a260cd754f28 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1368,6 +1368,7 @@ enum { NVME_SC_ANA_INACCESSIBLE = 0x302, NVME_SC_ANA_TRANSITION = 0x303, NVME_SC_HOST_PATH_ERROR = 0x370, + NVME_SC_HOST_ABORTED_CMD = 0x371, NVME_SC_CRD = 0x1800, NVME_SC_DNR = 0x4000, -- cgit v1.2.3 From 48c9e85b23464a7d1e3ebd70b79cc3a2d97d3222 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Mon, 14 Oct 2019 11:16:07 -0600 Subject: nvme: resync include/linux/nvme.h with nvmecli Update enumerations and structures in include/linux/nvme.h to resync with the nvmecli. All the updates are mentioned in the ratified NVMe 1.4 spec https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf Reviewed-by: Christoph Hellwig Signed-off-by: Revanth Rajashekar Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a260cd754f28..3eca4f7d8510 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -107,8 +107,22 @@ enum { NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ - NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_BPINFO = 0x0040, /* Boot Partition Information */ + NVME_REG_BPRSEL = 0x0044, /* Boot Partition Read Select */ + NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer + * Location + */ + NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ + NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ + NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ + NVME_REG_PMREBS = 0x0e0c, /* Persistent Memory Region Elasticity + * Buffer Size + */ + NVME_REG_PMRSWTP = 0x0e10, /* Persistent Memory Region Sustained + * Write Throughput + */ NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ }; @@ -295,6 +309,14 @@ enum { NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, + NVME_CTRL_CTRATT_128_ID = 1 << 0, + NVME_CTRL_CTRATT_NON_OP_PSP = 1 << 1, + NVME_CTRL_CTRATT_NVM_SETS = 1 << 2, + NVME_CTRL_CTRATT_READ_RECV_LVLS = 1 << 3, + NVME_CTRL_CTRATT_ENDURANCE_GROUPS = 1 << 4, + NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, + NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, + NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, }; struct nvme_lbaf { @@ -352,6 +374,9 @@ enum { NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, NVME_ID_CNS_CTRL_LIST = 0x13, + NVME_ID_CNS_SCNDRY_CTRL_LIST = 0x15, + NVME_ID_CNS_NS_GRANULARITY = 0x16, + NVME_ID_CNS_UUID_LIST = 0x17, }; enum { @@ -409,7 +434,8 @@ struct nvme_smart_log { __u8 avail_spare; __u8 spare_thresh; __u8 percent_used; - __u8 rsvd6[26]; + __u8 endu_grp_crit_warn_sumry; + __u8 rsvd7[25]; __u8 data_units_read[16]; __u8 data_units_written[16]; __u8 host_reads[16]; @@ -423,7 +449,11 @@ struct nvme_smart_log { __le32 warning_temp_time; __le32 critical_comp_time; __le16 temp_sensor[8]; - __u8 rsvd216[296]; + __le32 thm_temp1_trans_count; + __le32 thm_temp2_trans_count; + __le32 thm_temp1_total_time; + __le32 thm_temp2_total_time; + __u8 rsvd232[280]; }; struct nvme_fw_slot_info_log { @@ -440,6 +470,7 @@ enum { NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, + NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, }; struct nvme_effects_log { @@ -563,6 +594,7 @@ enum nvme_opcode { nvme_cmd_compare = 0x05, nvme_cmd_write_zeroes = 0x08, nvme_cmd_dsm = 0x09, + nvme_cmd_verify = 0x0c, nvme_cmd_resv_register = 0x0d, nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, @@ -806,10 +838,14 @@ enum nvme_admin_opcode { nvme_admin_ns_mgmt = 0x0d, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_dev_self_test = 0x14, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, nvme_admin_directive_send = 0x19, nvme_admin_directive_recv = 0x1a, + nvme_admin_virtual_mgmt = 0x1c, + nvme_admin_nvme_mi_send = 0x1d, + nvme_admin_nvme_mi_recv = 0x1e, nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, @@ -873,6 +909,7 @@ enum { NVME_FEAT_PLM_CONFIG = 0x13, NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_HOST_BEHAVIOR = 0x16, + NVME_FEAT_SANITIZE = 0x17, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, @@ -883,6 +920,10 @@ enum { NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST = 0x07, + NVME_LOG_TELEMETRY_CTRL = 0x08, + NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, @@ -1290,7 +1331,11 @@ enum { NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_SANITIZE_FAILED = 0x1C, + NVME_SC_SANITIZE_IN_PROGRESS = 0x1D, + NVME_SC_NS_WRITE_PROTECTED = 0x20, + NVME_SC_CMD_INTERRUPTED = 0x21, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, @@ -1328,6 +1373,8 @@ enum { NVME_SC_NS_NOT_ATTACHED = 0x11a, NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, NVME_SC_CTRL_LIST_INVALID = 0x11c, + NVME_SC_BP_WRITE_PROHIBITED = 0x11e, + NVME_SC_PMR_SAN_PROHIBITED = 0x123, /* * I/O Command Set Specific - NVM commands: -- cgit v1.2.3 From 0c65b2b90d13c1deaee6449304dd367c5d4eb8ae Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 4 Nov 2019 02:40:33 +0100 Subject: net: of_get_phy_mode: Change API to solve int/unit warnings Before this change of_get_phy_mode() returned an enum, phy_interface_t. On error, -ENODEV etc, is returned. If the result of the function is stored in a variable of type phy_interface_t, and the compiler has decided to represent this as an unsigned int, comparision with -ENODEV etc, is a signed vs unsigned comparision. Fix this problem by changing the API. Make the function return an error, or 0 on success, and pass a pointer, of type phy_interface_t, where the phy mode should be stored. v2: Return with *interface set to PHY_INTERFACE_MODE_NA on error. Add error checks to all users of of_get_phy_mode() Fixup a few reverse christmas tree errors Fixup a few slightly malformed reverse christmas trees v3: Fix 0-day reported errors. Reported-by: Dan Carpenter Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/of_net.h | 7 +++++-- include/linux/stmmac.h | 3 ++- include/linux/sxgbe_platform.h | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 6aeaea1775e6..71bbfcf3adcd 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -6,15 +6,18 @@ #ifndef __LINUX_OF_NET_H #define __LINUX_OF_NET_H +#include + #ifdef CONFIG_OF_NET #include struct net_device; -extern int of_get_phy_mode(struct device_node *np); +extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern const void *of_get_mac_address(struct device_node *np); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else -static inline int of_get_phy_mode(struct device_node *np) +static inline int of_get_phy_mode(struct device_node *np, + phy_interface_t *interface) { return -ENODEV; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 86f9464c3f5d..d4bcd9387136 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -13,6 +13,7 @@ #define __STMMAC_PLATFORM_DATA #include +#include #define MTL_MAX_RX_QUEUES 8 #define MTL_MAX_TX_QUEUES 8 @@ -132,7 +133,7 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; int interface; - int phy_interface; + phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; struct device_node *phylink_node; diff --git a/include/linux/sxgbe_platform.h b/include/linux/sxgbe_platform.h index 267369110584..85ec745767bd 100644 --- a/include/linux/sxgbe_platform.h +++ b/include/linux/sxgbe_platform.h @@ -10,6 +10,8 @@ #ifndef __SXGBE_PLATFORM_H__ #define __SXGBE_PLATFORM_H__ +#include + /* MDC Clock Selection define*/ #define SXGBE_CSR_100_150M 0x0 /* MDC = clk_scr_i/62 */ #define SXGBE_CSR_150_250M 0x1 /* MDC = clk_scr_i/102 */ @@ -38,7 +40,7 @@ struct sxgbe_plat_data { char *phy_bus_name; int bus_id; int phy_addr; - int interface; + phy_interface_t interface; struct sxgbe_mdio_bus_data *mdio_bus_data; struct sxgbe_dma_cfg *dma_cfg; int clk_csr; -- cgit v1.2.3 From b6520fce073b619e6f2c0d510bb3481c9386c70b Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Thu, 26 Sep 2019 12:06:45 +0200 Subject: netfilter: ipset: Add wildcard support to net,iface The net,iface equal functions currently compares the full interface names. In several cases, wildcard (or prefix) matching is useful. For example, when converting a large iptables rule-set to make use of ipset, I was able to significantly reduce the number of set elements by making use of wildcard matching. Wildcard matching is enabled by adding "wildcard" when adding an element to a set. Internally, this causes the IPSET_FLAG_IFACE_WILDCARD-flag to be set. When this flag is set, only the initial part of the interface name is used for comparison. Wildcard matching is done per element and not per set, as there are many cases where mixing wildcard and non-wildcard elements are useful. This means that is up to the user to handle (avoid) overlapping interface names. Signed-off-by: Kristian Evensen Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index eea166c52c36..11a72a938eb1 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -205,6 +205,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), IPSET_FLAG_BIT_WITH_SKBINFO = 6, IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), + IPSET_FLAG_BIT_IFACE_WILDCARD = 7, + IPSET_FLAG_IFACE_WILDCARD = (1 << IPSET_FLAG_BIT_IFACE_WILDCARD), IPSET_FLAG_CADT_MAX = 15, }; -- cgit v1.2.3 From 8adeac3be03d400f9c2391d52f85cd27bd188800 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 2 Oct 2019 14:03:41 -0500 Subject: dm stripe: use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct stripe_c { ... struct stripe stripe[0]; }; In this case alloc_context() and dm_array_too_big() are removed and replaced by the direct use of the struct_size() helper in kmalloc(). Notice that open-coded form is prone to type mistakes. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Mike Snitzer --- include/linux/device-mapper.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 399ad8632356..2e13826898b2 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -594,9 +594,6 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); */ #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) -#define dm_array_too_big(fixed, obj, num) \ - ((num) > (UINT_MAX - (fixed)) / (obj)) - /* * Sector offset taken relative to the start of the target instead of * relative to the start of the device. -- cgit v1.2.3 From 15122464d525f684a61806d28597050cdcef0f32 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Sat, 2 Nov 2019 01:12:03 +0100 Subject: icmp: add helpers to recognize ICMP error packets Add two helper functions, one for IPv4 and one for IPv6, to recognize the ICMP packets which are error responses. This packets are special because they have as payload the original header of the packet which generated it (RFC 792 says at least 8 bytes, but Linux actually includes much more than that). Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/linux/icmp.h | 15 +++++++++++++++ include/linux/icmpv6.h | 14 ++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 2d8aaf7d4b9e..81ca84ce3119 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -20,4 +20,19 @@ static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } + +static inline bool icmp_is_err(int type) +{ + switch (type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + return true; + } + + return false; +} + #endif /* _LINUX_ICMP_H */ diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index a8f888976137..ef1cbb5f454f 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -46,4 +46,18 @@ extern void icmpv6_flow_init(struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); + +static inline bool icmpv6_is_err(int type) +{ + switch (type) { + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + return true; + } + + return false; +} + #endif -- cgit v1.2.3 From 5cd73fbd78794d9c9c4e7a61dc8fa83489b43d03 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Nov 2019 01:12:57 +0100 Subject: net: dsa: Add support for devlink resources Add wrappers around the devlink resource API, so that DSA drivers can register and unregister devlink resources. Signed-off-by: Andrew Lunn Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/dsa.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e4c697b95c70..9507611a41f0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -586,6 +586,22 @@ int dsa_devlink_params_register(struct dsa_switch *ds, void dsa_devlink_params_unregister(struct dsa_switch *ds, const struct devlink_param *params, size_t params_count); +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv); +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id); + struct dsa_devlink_priv { struct dsa_switch *ds; }; -- cgit v1.2.3 From 4d390c287b2f3fbd0bb64c52c1a9418f790986e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:13 -0800 Subject: net_sched: do not export gnet_stats_basic_packed to uapi gnet_stats_basic_packed was really meant to be private kernel structure. If this proves to be a problem, we will have to rename the in-kernel version. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/gen_stats.h | 6 ++++++ include/uapi/linux/gen_stats.h | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index ca23860adbb9..5f3889e7ec1b 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -7,6 +7,12 @@ #include #include +/* Note: this used to be in include/uapi/linux/gen_stats.h */ +struct gnet_stats_basic_packed { + __u64 bytes; + __u32 packets; +} __attribute__ ((packed)); + struct gnet_stats_basic_cpu { struct gnet_stats_basic_packed bstats; struct u64_stats_sync syncp; diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 065408e16a80..4eaacdf452e3 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -26,10 +26,6 @@ struct gnet_stats_basic { __u64 bytes; __u32 packets; }; -struct gnet_stats_basic_packed { - __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); /** * struct gnet_stats_rate_est - rate estimator -- cgit v1.2.3 From d0083d98f685b9f4fe810570f93cef0b0bb6b354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:14 -0800 Subject: net_sched: extend packet counter to 64bit After this change, qdisc packet counter is no longer a 32bit quantity. We still export 32bit values to user. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/gen_stats.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 5f3889e7ec1b..1424e02cef90 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -10,8 +10,8 @@ /* Note: this used to be in include/uapi/linux/gen_stats.h */ struct gnet_stats_basic_packed { __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); + __u64 packets; +}; struct gnet_stats_basic_cpu { struct gnet_stats_basic_packed bstats; -- cgit v1.2.3 From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:15 -0800 Subject: net_sched: add TCA_STATS_PKT64 attribute Now the kernel uses 64bit packet counters in scheduler layer, we want to export these counters to user space. Instead risking breaking user space by adding fields to struct gnet_stats_basic, add a new TCA_STATS_PKT64. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 4eaacdf452e3..852f234f1fd6 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -13,6 +13,7 @@ enum { TCA_STATS_RATE_EST64, TCA_STATS_PAD, TCA_STATS_BASIC_HW, + TCA_STATS_PKT64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) -- cgit v1.2.3 From fbf6c73c5b264c25484fa9f449b5546569fe11f0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 17:51:10 +0100 Subject: ftrace: add ftrace_init_nop() Architectures may need to perform special initialization of ftrace callsites, and today they do so by special-casing ftrace_make_nop() when the expected branch address is MCOUNT_ADDR. In some cases (e.g. for patchable-function-entry), we don't have an mcount-like symbol and don't want a synthetic MCOUNT_ADDR, but we may need to perform some initialization of callsites. To make it possible to separate initialization from runtime modification, and to handle cases without an mcount-like symbol, this patch adds an optional ftrace_init_nop() function that architectures can implement, which does not pass a branch address. Where an architecture does not provide ftrace_init_nop(), we will fall back to the existing behaviour of calling ftrace_make_nop() with MCOUNT_ADDR. At the same time, ftrace_code_disable() is renamed to ftrace_nop_initialize() to make it clearer that it is intended to intialize a callsite into a disabled state, and is not for disabling a callsite that has been runtime enabled. The kerneldoc description of rec arguments is updated to cover non-mcount callsites. Signed-off-by: Mark Rutland Reviewed-by: Amit Daniel Kachhap Reviewed-by: Ard Biesheuvel Reviewed-by: Miroslav Benes Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar --- include/linux/ftrace.h | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8a8cb3c401b2..9867d90d635e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -499,7 +499,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } /** * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization - * @rec: the mcount call site record + * @rec: the call site record (e.g. mcount/fentry) * @addr: the address that the call site should be calling * * This is a very sensitive operation and great care needs @@ -520,9 +520,38 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } extern int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr); + +/** + * ftrace_init_nop - initialize a nop call site + * @mod: module structure if called by module load initialization + * @rec: the call site record (e.g. mcount/fentry) + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should contain the contents created by + * the compiler + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. + */ +#ifndef ftrace_init_nop +static inline int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) +{ + return ftrace_make_nop(mod, rec, MCOUNT_ADDR); +} +#endif + /** * ftrace_make_call - convert a nop call site into a call to addr - * @rec: the mcount call site record + * @rec: the call site record (e.g. mcount/fentry) * @addr: the address that the call site should call * * This is a very sensitive operation and great care needs @@ -545,7 +574,7 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS /** * ftrace_modify_call - convert from one addr to another (no nop) - * @rec: the mcount call site record + * @rec: the call site record (e.g. mcount/fentry) * @old_addr: the address expected to be currently called to * @addr: the address to change to * -- cgit v1.2.3 From a1326b17ac03a9012cb3d01e434aacb4d67a416c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 16 Oct 2019 18:17:11 +0100 Subject: module/ftrace: handle patchable-function-entry When using patchable-function-entry, the compiler will record the callsites into a section named "__patchable_function_entries" rather than "__mcount_loc". Let's abstract this difference behind a new FTRACE_CALLSITE_SECTION, so that architectures don't have to handle this explicitly (e.g. with custom module linker scripts). As parisc currently handles this explicitly, it is fixed up accordingly, with its custom linker script removed. Since FTRACE_CALLSITE_SECTION is only defined when DYNAMIC_FTRACE is selected, the parisc module loading code is updated to only use the definition in that case. When DYNAMIC_FTRACE is not selected, modules shouldn't have this section, so this removes some redundant work in that case. To make sure that this is keep up-to-date for modules and the main kernel, a comment is added to vmlinux.lds.h, with the existing ifdeffery simplified for legibility. I built parisc generic-{32,64}bit_defconfig with DYNAMIC_FTRACE enabled, and verified that the section made it into the .ko files for modules. Signed-off-by: Mark Rutland Acked-by: Helge Deller Acked-by: Steven Rostedt (VMware) Reviewed-by: Ard Biesheuvel Reviewed-by: Torsten Duwe Tested-by: Amit Daniel Kachhap Tested-by: Sven Schnelle Tested-by: Torsten Duwe Cc: Ingo Molnar Cc: James E.J. Bottomley Cc: Jessica Yu Cc: linux-parisc@vger.kernel.org --- include/asm-generic/vmlinux.lds.h | 14 +++++++------- include/linux/ftrace.h | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dae64600ccbf..a9c4e4721434 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -110,17 +110,17 @@ #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD -#ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY -#define MCOUNT_REC() . = ALIGN(8); \ - __start_mcount_loc = .; \ - KEEP(*(__patchable_function_entries)) \ - __stop_mcount_loc = .; -#else +/* + * The ftrace call sites are logged to a section whose name depends on the + * compiler option used. A given kernel image will only use one, AKA + * FTRACE_CALLSITE_SECTION. We capture all of them here to avoid header + * dependencies for FTRACE_CALLSITE_SECTION's definition. + */ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ + KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; -#endif #else #define MCOUNT_REC() #endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9867d90d635e..9141f2263286 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -738,6 +738,11 @@ static inline unsigned long get_lock_parent_ip(void) #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); +#ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY +#define FTRACE_CALLSITE_SECTION "__patchable_function_entries" +#else +#define FTRACE_CALLSITE_SECTION "__mcount_loc" +#endif #else static inline void ftrace_init(void) { } #endif -- cgit v1.2.3 From b103fb7653fff09e7a6fb6ba9398a41584e7ae36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 24 Oct 2019 14:54:36 -0700 Subject: fscrypt: add support for IV_INO_LBLK_64 policies Inline encryption hardware compliant with the UFS v2.1 standard or with the upcoming version of the eMMC standard has the following properties: (1) Per I/O request, the encryption key is specified by a previously loaded keyslot. There might be only a small number of keyslots. (2) Per I/O request, the starting IV is specified by a 64-bit "data unit number" (DUN). IV bits 64-127 are assumed to be 0. The hardware automatically increments the DUN for each "data unit" of configurable size in the request, e.g. for each filesystem block. Property (1) makes it inefficient to use the traditional fscrypt per-file keys. Property (2) precludes the use of the existing DIRECT_KEY fscrypt policy flag, which needs at least 192 IV bits. Therefore, add a new fscrypt policy flag IV_INO_LBLK_64 which causes the encryption to modified as follows: - The encryption keys are derived from the master key, encryption mode number, and filesystem UUID. - The IVs are chosen as (inode_number << 32) | file_logical_block_num. For filenames encryption, file_logical_block_num is 0. Since the file nonces aren't used in the key derivation, many files may share the same encryption key. This is much more efficient on the target hardware. Including the inode number in the IVs and mixing the filesystem UUID into the keys ensures that data in different files is nevertheless still encrypted differently. Additionally, limiting the inode and block numbers to 32 bits and placing the block number in the low bits maintains compatibility with the 64-bit DUN convention (property (2) above). Since this scheme assumes that inode numbers are stable (which may preclude filesystem shrinking) and that inode and file logical block numbers are at most 32-bit, IV_INO_LBLK_64 will only be allowed on filesystems that meet these constraints. These are acceptable limitations for the cases where this format would actually be used. Note that IV_INO_LBLK_64 is an on-disk format, not an implementation. This patch just adds support for it using the existing filesystem layer encryption. A later patch will add support for inline encryption. Reviewed-by: Paul Crowley Co-developed-by: Satya Tangirala Signed-off-by: Satya Tangirala Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 3 +++ include/uapi/linux/fscrypt.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 04f5ed628445..1a7bffe78ed5 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,9 @@ struct fscrypt_operations { bool (*dummy_context)(struct inode *); bool (*empty_dir)(struct inode *); unsigned int max_namelen; + bool (*has_stable_inodes)(struct super_block *sb); + void (*get_ino_and_lblk_bits)(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret); }; static inline bool fscrypt_has_encryption_key(const struct inode *inode) diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 39ccfe9311c3..1beb174ad950 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -17,7 +17,8 @@ #define FSCRYPT_POLICY_FLAGS_PAD_32 0x03 #define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 #define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 -#define FSCRYPT_POLICY_FLAGS_VALID 0x07 +#define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 0x08 +#define FSCRYPT_POLICY_FLAGS_VALID 0x0F /* Encryption algorithms */ #define FSCRYPT_MODE_AES_256_XTS 1 -- cgit v1.2.3 From 1bb5ec2eec48dcab1d8ae3707e4a388da6a9c9dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 6 Nov 2019 12:49:57 -0800 Subject: cgroup: use cgroup->last_bstat instead of cgroup->bstat_pending for consistency cgroup->bstat_pending is used to determine the base stat delta to propagate to the parent. While correct, this is different from how percpu delta is determined for no good reason and the inconsistency makes the code more difficult to understand. This patch makes parent propagation delta calculation use the same method as percpu to global propagation. * cgroup_base_stat_accumulate() is renamed to cgroup_base_stat_add() and cgroup_base_stat_sub() is added. * percpu propagation calculation is updated to use the above helpers. * cgroup->bstat_pending is replaced with cgroup->last_bstat and updated to use the same calculation as percpu propagation. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 430e219e3aba..4904b1ebd1ff 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -458,7 +458,7 @@ struct cgroup { struct list_head rstat_css_list; /* cgroup basic resource statistics */ - struct cgroup_base_stat pending_bstat; /* pending from children */ + struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ -- cgit v1.2.3 From 25c7a6d1f90e208ec27ca854b1381ed39842ec57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:51 -0800 Subject: net: avoid potential false sharing in neighbor related code There are common instances of the following construct : if (n->confirmed != now) n->confirmed = now; A C compiler could legally remove the conditional. Use READ_ONCE()/WRITE_ONCE() to avoid this problem. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/arp.h | 4 ++-- include/net/ndisc.h | 8 ++++---- include/net/sock.h | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index c8f580a0e6b1..4950191f6b2b 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -57,8 +57,8 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b2f715ca0567..b5ebeb3b0de0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -414,8 +414,8 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } @@ -431,8 +431,8 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } diff --git a/include/net/sock.h b/include/net/sock.h index ac6042d0af32..f2f853439b65 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1939,8 +1939,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { - if (!sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 1; + if (!READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) @@ -1950,10 +1950,10 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) unsigned long now = jiffies; /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; - if (sk && sk->sk_dst_pending_confirm) - sk->sk_dst_pending_confirm = 0; + if (READ_ONCE(n->confirmed) != now) + WRITE_ONCE(n->confirmed, now); + if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) + WRITE_ONCE(sk->sk_dst_pending_confirm, 0); } } -- cgit v1.2.3 From 288efe8606b62d0753ba6722b36ef241877251fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:53 -0800 Subject: net: annotate lockless accesses to sk->sk_ack_backlog sk->sk_ack_backlog can be read without any lock being held. We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing and/or potential KCSAN warnings. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f2f853439b65..a126784aa7d9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -859,17 +859,17 @@ static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) static inline void sk_acceptq_removed(struct sock *sk) { - sk->sk_ack_backlog--; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { - sk->sk_ack_backlog++; + WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } static inline bool sk_acceptq_is_full(const struct sock *sk) { - return sk->sk_ack_backlog > sk->sk_max_ack_backlog; + return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog; } /* -- cgit v1.2.3 From 099ecf59f05b5f30f42ebac0ab8cb94f9b18c90c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:54 -0800 Subject: net: annotate lockless accesses to sk->sk_max_ack_backlog sk->sk_max_ack_backlog can be read without any lock being held at least in TCP/DCCP cases. We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing and/or potential KCSAN warnings. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a126784aa7d9..d4d3ef5ba049 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -869,7 +869,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline bool sk_acceptq_is_full(const struct sock *sk) { - return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog; + return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* -- cgit v1.2.3 From 4ece477870774698e6e73d5821a3dd1605ca123b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:05 +0800 Subject: lwtunnel: add options setting and dumping for geneve To add options setting and dumping, .build_state(), .fill_encap() and .get_encap_size() in ip_tun_lwt_ops needs to be extended: ip_tun_build_state(): ip_tun_parse_opts(): ip_tun_parse_opts_geneve() ip_tun_fill_encap_info(): ip_tun_fill_encap_opts(): ip_tun_fill_encap_opts_geneve() ip_tun_encap_nlsize() ip_tun_opts_nlsize(): if (tun_flags & TUNNEL_GENEVE_OPT) ip_tun_parse_opts(), ip_tun_fill_encap_opts() and ip_tun_opts_nlsize() processes LWTUNNEL_IP_OPTS. ip_tun_parse_opts_geneve(), ip_tun_fill_encap_opts_geneve() and if (tun_flags & TUNNEL_GENEVE_OPT) processes LWTUNNEL_IP_OPTS_GENEVE. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index de696ca12f2c..b595ab219036 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -27,6 +27,7 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_TOS, LWTUNNEL_IP_FLAGS, LWTUNNEL_IP_PAD, + LWTUNNEL_IP_OPTS, __LWTUNNEL_IP_MAX, }; @@ -41,11 +42,30 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_TC, LWTUNNEL_IP6_FLAGS, LWTUNNEL_IP6_PAD, + LWTUNNEL_IP6_OPTS, __LWTUNNEL_IP6_MAX, }; #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWTUNNEL_IP_OPTS_UNSPEC, + LWTUNNEL_IP_OPTS_GENEVE, + __LWTUNNEL_IP_OPTS_MAX, +}; + +#define LWTUNNEL_IP_OPTS_MAX (__LWTUNNEL_IP_OPTS_MAX - 1) + +enum { + LWTUNNEL_IP_OPT_GENEVE_UNSPEC, + LWTUNNEL_IP_OPT_GENEVE_CLASS, + LWTUNNEL_IP_OPT_GENEVE_TYPE, + LWTUNNEL_IP_OPT_GENEVE_DATA, + __LWTUNNEL_IP_OPT_GENEVE_MAX, +}; + +#define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From edf31cbb1502481da181a09148adb33e12599185 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:06 +0800 Subject: lwtunnel: add options setting and dumping for vxlan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_vxlan() for .build_state and ip_tun_fill_encap_opts_vxlan() for .fill_encap and if (tun_flags & TUNNEL_VXLAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index b595ab219036..638b7b108453 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -51,6 +51,7 @@ enum lwtunnel_ip6_t { enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, + LWTUNNEL_IP_OPTS_VXLAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -66,6 +67,14 @@ enum { #define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) +enum { + LWTUNNEL_IP_OPT_VXLAN_UNSPEC, + LWTUNNEL_IP_OPT_VXLAN_GBP, + __LWTUNNEL_IP_OPT_VXLAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From b0a21810bd5e1f92e3379899cc8ca9fe144ee8b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:07 +0800 Subject: lwtunnel: add options setting and dumping for erspan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_erspan() for .build_state and ip_tun_fill_encap_opts_erspan() for .fill_encap and if (tun_flags & TUNNEL_ERSPAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 638b7b108453..f6035f737193 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -52,6 +52,7 @@ enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, LWTUNNEL_IP_OPTS_VXLAN, + LWTUNNEL_IP_OPTS_ERSPAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -75,6 +76,17 @@ enum { #define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) +enum { + LWTUNNEL_IP_OPT_ERSPAN_UNSPEC, + LWTUNNEL_IP_OPT_ERSPAN_VER, + LWTUNNEL_IP_OPT_ERSPAN_INDEX, + LWTUNNEL_IP_OPT_ERSPAN_DIR, + LWTUNNEL_IP_OPT_ERSPAN_HWID, + __LWTUNNEL_IP_OPT_ERSPAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_ERSPAN_MAX (__LWTUNNEL_IP_OPT_ERSPAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From 9ed498c6280a2f2b51d02df96df53037272ede49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 10:04:11 -0800 Subject: net: silence data-races on sk_backlog.tail sk->sk_backlog.tail might be read without holding the socket spinlock, we need to add proper READ_ONCE()/WRITE_ONCE() to silence the warnings. KCSAN reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg write to 0xffff8881265109f8 of 8 bytes by interrupt on cpu 1: __sk_add_backlog include/net/sock.h:907 [inline] sk_add_backlog include/net/sock.h:938 [inline] tcp_add_backlog+0x476/0xce0 net/ipv4/tcp_ipv4.c:1759 tcp_v4_rcv+0x1a70/0x1bd0 net/ipv4/tcp_ipv4.c:1947 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:4929 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5043 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5133 napi_skb_finish net/core/dev.c:5596 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5629 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6311 [inline] net_rx_action+0x3ae/0xa90 net/core/dev.c:6379 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] do_IRQ+0xa6/0x180 arch/x86/kernel/irq.c:263 ret_from_intr+0x0/0x19 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 read to 0xffff8881265109f8 of 8 bytes by task 8057 on cpu 0: tcp_recvmsg+0x46e/0x1b40 net/ipv4/tcp.c:2050 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1889 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 ksys_read+0xd5/0x1b0 fs/read_write.c:587 __do_sys_read fs/read_write.c:597 [inline] __se_sys_read fs/read_write.c:595 [inline] __x64_sys_read+0x4c/0x60 fs/read_write.c:595 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8057 Comm: syz-fuzzer Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index d4d3ef5ba049..bd210c78dc9d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -899,11 +899,11 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb_dst_force(skb); if (!sk->sk_backlog.tail) - sk->sk_backlog.head = skb; + WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; - sk->sk_backlog.tail = skb; + WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } -- cgit v1.2.3 From 2c63221cd9e5c0dad0424029aeb1c40faada8330 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 6 Nov 2019 23:36:13 +0100 Subject: dt-bindings: net: phy: Add support for AT803X Document the Atheros AR803x PHY bindings. Signed-off-by: Michael Walle Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/dt-bindings/net/qca-ar803x.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/dt-bindings/net/qca-ar803x.h (limited to 'include') diff --git a/include/dt-bindings/net/qca-ar803x.h b/include/dt-bindings/net/qca-ar803x.h new file mode 100644 index 000000000000..9c046c7242ed --- /dev/null +++ b/include/dt-bindings/net/qca-ar803x.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Device Tree constants for the Qualcomm Atheros AR803x PHYs + */ + +#ifndef _DT_BINDINGS_QCA_AR803X_H +#define _DT_BINDINGS_QCA_AR803X_H + +#define AR803X_STRENGTH_FULL 0 +#define AR803X_STRENGTH_HALF 1 +#define AR803X_STRENGTH_QUARTER 2 + +#endif -- cgit v1.2.3 From 99c4f70df3a6446c56ca817c2d0f9c12d85d4e7c Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 6 Nov 2019 18:31:24 +0100 Subject: regulator: ab8500: Remove AB8505 USB regulator The USB regulator was removed for AB8500 in commit 41a06aa738ad ("regulator: ab8500: Remove USB regulator"). It was then added for AB8505 in commit 547f384f33db ("regulator: ab8500: add support for ab8505"). However, there was never an entry added for it in ab8505_regulator_match. This causes all regulators after it to be initialized with the wrong device tree data, eventually leading to an out-of-bounds array read. Given that it is not used anywhere in the kernel, it seems likely that similar arguments against supporting it exist for AB8505 (it is controlled by hardware). Therefore, simply remove it like for AB8500 instead of adding an entry in ab8505_regulator_match. Fixes: 547f384f33db ("regulator: ab8500: add support for ab8505") Cc: Linus Walleij Signed-off-by: Stephan Gerhold Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20191106173125.14496-1-stephan@gerhold.net Signed-off-by: Mark Brown --- include/linux/regulator/ab8500.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 7cf8f797e13a..505e94a6e3e8 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -37,7 +37,6 @@ enum ab8505_regulator_id { AB8505_LDO_AUX6, AB8505_LDO_INTCORE, AB8505_LDO_ADC, - AB8505_LDO_USB, AB8505_LDO_AUDIO, AB8505_LDO_ANAMIC1, AB8505_LDO_ANAMIC2, -- cgit v1.2.3 From 458ea3ad033fc86e291712ce50cbe60c3428cf30 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 6 Nov 2019 18:31:25 +0100 Subject: regulator: ab8500: Remove SYSCLKREQ from enum ab8505_regulator_id Those regulators are not actually supported by the AB8500 regulator driver. There is no ab8500_regulator_info for them and no entry in ab8505_regulator_match. As such, they cannot be registered successfully, and looking them up in ab8505_regulator_match causes an out-of-bounds array read. Fixes: 547f384f33db ("regulator: ab8500: add support for ab8505") Cc: Linus Walleij Signed-off-by: Stephan Gerhold Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20191106173125.14496-2-stephan@gerhold.net Signed-off-by: Mark Brown --- include/linux/regulator/ab8500.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 505e94a6e3e8..3ab1ddf151a2 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -42,8 +42,6 @@ enum ab8505_regulator_id { AB8505_LDO_ANAMIC2, AB8505_LDO_AUX8, AB8505_LDO_ANA, - AB8505_SYSCLKREQ_2, - AB8505_SYSCLKREQ_4, AB8505_NUM_REGULATORS, }; -- cgit v1.2.3 From 6c1b1da58f8c7a697a88ae35afeba196fc7b701e Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:45 +0900 Subject: block: add zone open, close and finish operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zoned block devices (ZBC and ZAC devices) allow an explicit control over the condition (state) of zones. The operations allowed are: * Open a zone: Transition to open condition to indicate that a zone will actively be written * Close a zone: Transition to closed condition to release the drive resources used for writing to a zone * Finish a zone: Transition an open or closed zone to the full condition to prevent write operations To enable this control for in-kernel zoned block device users, define the new request operations REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH as well as the generic function blkdev_zone_mgmt() for submitting these operations on a range of zones. This results in blkdev_reset_zones() removal and replacement with this new zone magement function. Users of blkdev_reset_zones() (f2fs and dm-zoned) are updated accordingly. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 5 +++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e7eeec16458..23a2fd534817 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -290,6 +290,12 @@ enum req_opf { REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, + /* Open a zone */ + REQ_OP_ZONE_OPEN = 10, + /* Close a zone */ + REQ_OP_ZONE_CLOSE = 11, + /* Transition a zone to full */ + REQ_OP_ZONE_FINISH = 12, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, @@ -417,6 +423,25 @@ static inline bool op_is_discard(unsigned int op) return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } +/* + * Check if a bio or request operation is a zone management operation, with + * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case + * due to its different handling in the block layer and device response in + * case of command failure. + */ +static inline bool op_is_zone_mgmt(enum req_opf op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return true; + default: + return false; + } +} + static inline int op_stat_group(unsigned int op) { if (op_is_discard(op)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4051acb92a1..9cfafff86f66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -360,8 +360,9 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones); -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, - sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sectors, sector_t nr_sectors, + gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3 From e876df1fe0ad1b191284ee6ed2db7960bd322d00 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:46 +0900 Subject: block: add zone open, close and finish ioctl support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce three new ioctl commands BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE to allow applications to control the condition of zones on a zoned block device through the execution of the REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH operations. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 +++++----- include/uapi/linux/blkzoned.h | 17 ++++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cfafff86f66..6a4f7abbdcf7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,8 +367,8 @@ extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); +extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ @@ -389,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev, return -ENOTTY; } -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) { return -ENOTTY; } diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 498eec813494..0cdef67135f0 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -120,9 +120,11 @@ struct blk_zone_report { }; /** - * struct blk_zone_range - BLKRESETZONE ioctl request - * @sector: starting sector of the first zone to issue reset write pointer - * @nr_sectors: Total number of sectors of 1 or more zones to reset + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/ + * BLKCLOSEZONE/BLKFINISHZONE ioctl + * requests + * @sector: Starting sector of the first zone to operate on. + * @nr_sectors: Total number of sectors of all zones to operate on. */ struct blk_zone_range { __u64 sector; @@ -139,10 +141,19 @@ struct blk_zone_range { * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. * @BLKGETNRZONES: Get the total number of zones of the device. + * @BLKOPENZONE: Open the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKCLOSEZONE: Close the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKFINISHZONE: Mark the zones as full in the specified sector range. + * The 512 B sector range must be zone aligned. */ #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) #define BLKGETZONESZ _IOR(0x12, 132, __u32) #define BLKGETNRZONES _IOR(0x12, 133, __u32) +#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) +#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) #endif /* _UAPI_BLKZONED_H */ -- cgit v1.2.3 From 2665abfd757fb35a241c6f0b1ebf620e3ffb36fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 12:40:47 -0700 Subject: io_uring: add support for linked SQE timeouts While we have support for generic timeouts, we don't have a way to tie a timeout to a specific SQE. The generic timeouts simply trigger wakeups on the CQ ring. This adds support for IORING_OP_LINK_TIMEOUT. This command is only valid as a link to a previous command. The timeout specific can be either relative or absolute, following the same rules as IORING_OP_TIMEOUT. If the timeout triggers before the dependent command completes, it will attempt to cancel that command. Likewise, if the dependent command completes before the timeout triggers, it will cancel the timeout. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6877cf8894db..f1a118b01d18 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -72,6 +72,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 #define IORING_OP_ASYNC_CANCEL 14 +#define IORING_OP_LINK_TIMEOUT 15 /* * sqe->fsync_flags -- cgit v1.2.3 From 1d7bb1d50fb4dc141c7431cc21fdd24ffcc83c76 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Nov 2019 11:31:17 -0700 Subject: io_uring: add support for backlogged CQ ring Currently we drop completion events, if the CQ ring is full. That's fine for requests with bounded completion times, but it may make it harder or impossible to use io_uring with networked IO where request completion times are generally unbounded. Or with POLL, for example, which is also unbounded. After this patch, we never overflow the ring, we simply store requests in a backlog for later flushing. This flushing is done automatically by the kernel. To prevent the backlog from growing indefinitely, if the backlog is non-empty, we apply back pressure on IO submissions. Any attempt to submit new IO with a non-empty backlog will get an -EBUSY return from the kernel. This is a signal to the application that it has backlogged CQ events, and that it must reap those before being allowed to submit more IO. Note that if we do return -EBUSY, we will have filled whatever backlogged events into the CQ ring first, if there's room. This means the application can safely reap events WITHOUT entering the kernel and waiting for them, they are already available in the CQ ring. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a118b01d18..2a1569211d87 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -155,6 +155,7 @@ struct io_uring_params { * io_uring_params->features flags */ #define IORING_FEAT_SINGLE_MMAP (1U << 0) +#define IORING_FEAT_NODROP (1U << 1) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 8a80d5d6638b7d58480a83aef49d587de63d4cbb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:02 -0800 Subject: blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() These don't have users anymore. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bed9e43f9426..914ce55fa8c2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -220,11 +220,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -int blkg_print_stat_bytes(struct seq_file *sf, void *v); -int blkg_print_stat_ios(struct seq_file *sf, void *v); -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); - void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); -- cgit v1.2.3 From f73316482977ac401ac37245c9df48079d4e11f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:03 -0800 Subject: blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup has been using blkg_rwstat to track basic IO stats. Unfortunately, reading recursive stats scales badly as itinvolves walking all descendants. On systems with a huge number of cgroups (dead or alive), this can lead to substantial CPU cost when reading IO stats. This patch reimplements basic IO stats using cgroup rstat which uses more memory but makes recursive stat reading O(# descendants which have been active since last reading) instead of O(# descendants). * blk-cgroup core no longer uses sync/async stats. Introduce new stat enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}. * Add blkg_iostat[_set] which encapsulates byte and io stats, last values for propagation delta calculation and u64_stats_sync for correctness on 32bit archs. * Update the new percpu stat counters directly and implement blkcg_rstat_flush() to implement propagation. * blkg_print_stat() can now bring the stats up to date by calling cgroup_rstat_flush() and print them instead of directly summing up all descendants. * It now allocates 96 bytes per cpu. It used to be 40 bytes. Signed-off-by: Tejun Heo Cc: Dan Schatzberg Cc: Daniel Xu Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 48 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 914ce55fa8c2..867ab391e409 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -31,6 +33,14 @@ #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, @@ -61,6 +71,17 @@ struct blkcg { #endif }; +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. @@ -127,8 +148,8 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; @@ -740,15 +761,32 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + struct blkg_iostat_set *bis; + int rwd, cpu; + + if (op_is_discard(bio->bi_opf)) + rwd = BLKG_IOSTAT_DISCARD; + else if (op_is_write(bio->bi_opf)) + rwd = BLKG_IOSTAT_WRITE; + else + rwd = BLKG_IOSTAT_READ; + + cpu = get_cpu(); + bis = per_cpu_ptr(blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the * size of the bio. */ if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + put_cpu(); } blkcg_bio_issue_init(bio); -- cgit v1.2.3 From 1d156646e0d8ec390e5d5ac288137df02d4207be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:04 -0800 Subject: blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blkg_rwstat is now only used by bfq-iosched and blk-throtl when on cgroup1. Let's move it into its own files and gate it behind a config option. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 159 --------------------------------------------- 1 file changed, 159 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 867ab391e409..48a66738143d 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -41,17 +41,6 @@ enum blkg_iostat_type { BLKG_IOSTAT_NR, }; -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - BLKG_RWSTAT_DISCARD, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - struct blkcg_gq; struct blkcg { @@ -82,19 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. - */ -struct blkg_rwstat { - struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; - atomic64_t aux_cnt[BLKG_RWSTAT_NR]; -}; - -struct blkg_rwstat_sample { - u64 cnt[BLKG_RWSTAT_NR]; -}; - /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -223,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); -static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, - unsigned int idx) -{ - return atomic64_read(&rwstat->aux_cnt[idx]) + - percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); -} - const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, @@ -237,12 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -594,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) -{ - int i, ret; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } - atomic64_set(&rwstat->aux_cnt[i], 0); - } - return 0; -} - -static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) -{ - struct percpu_counter *cnt; - - if (op_is_discard(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - - if (op_is_sync(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it in the aux counts. - */ -static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, - struct blkg_rwstat_sample *result) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - result->cnt[i] = - percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat_sample tmp = { }; - - blkg_rwstat_read(rwstat, &tmp); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - percpu_counter_set(&rwstat->cpu_cnt[i], 0); - atomic64_set(&rwstat->aux_cnt[i], 0); - } -} - -/** - * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's count including the aux one to @to's aux count. - */ -static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - u64 sum[BLKG_RWSTAT_NR]; - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), - &to->aux_cnt[i]); -} - #ifdef CONFIG_BLK_DEV_THROTTLING extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio); -- cgit v1.2.3 From 200ecef67b8d09d16ec55f91c92751dcc7a38d40 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:51:18 -0800 Subject: tcp: Remove one extra ktime_get_ns() from cookie_init_timestamp tcp_make_synack() already uses tcp_clock_ns(), and can pass the value to cookie_init_timestamp() to avoid another call to ktime_get_ns() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ab4eb5eb5d07..36f195fb576a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -537,7 +537,7 @@ static inline u32 tcp_cookie_time(void) u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); -u64 cookie_init_timestamp(struct request_sock *req); +u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, @@ -757,10 +757,16 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } +/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ +static inline u32 tcp_ns_to_ts(u64 ns) +{ + return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); +} + /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { - return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); @@ -772,7 +778,7 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { - return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ); + return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ -- cgit v1.2.3 From 6896cc4d8fe6fe6163d6f0baa02a270da68896e8 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:09 +0200 Subject: devlink: Add layer 3 generic packet traps Add packet traps that can report packets that were dropped during layer 3 forwarding. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 6bf3b9e0595a..df7814d55bf9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -569,6 +569,15 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_ROUTE, DEVLINK_TRAP_GENERIC_ID_TTL_ERROR, DEVLINK_TRAP_GENERIC_ID_TAIL_DROP, + DEVLINK_TRAP_GENERIC_ID_NON_IP_PACKET, + DEVLINK_TRAP_GENERIC_ID_UC_DIP_MC_DMAC, + DEVLINK_TRAP_GENERIC_ID_DIP_LB, + DEVLINK_TRAP_GENERIC_ID_SIP_MC, + DEVLINK_TRAP_GENERIC_ID_SIP_LB, + DEVLINK_TRAP_GENERIC_ID_CORRUPTED_IP_HDR, + DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -607,6 +616,24 @@ enum devlink_trap_group_generic_id { "ttl_value_is_too_small" #define DEVLINK_TRAP_GENERIC_NAME_TAIL_DROP \ "tail_drop" +#define DEVLINK_TRAP_GENERIC_NAME_NON_IP_PACKET \ + "non_ip" +#define DEVLINK_TRAP_GENERIC_NAME_UC_DIP_MC_DMAC \ + "uc_dip_over_mc_dmac" +#define DEVLINK_TRAP_GENERIC_NAME_DIP_LB \ + "dip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_MC \ + "sip_is_mc" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_LB \ + "sip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_CORRUPTED_IP_HDR \ + "ip_header_corrupted" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_SIP_BC \ + "ipv4_sip_is_limited_bc" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_RESERVED_SCOPE \ + "ipv6_mc_dip_reserved_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ + "ipv6_mc_dip_interface_local_scope" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From 3b063ae57bdfec5e574ace440e6c3f34c4115a92 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:14 +0200 Subject: devlink: Add layer 3 generic packet exception traps Add layer 3 generic packet exception traps that can report trapped packets and documentation of the traps. Unlike drop traps, these exception traps also need to inject the packet to the kernel's receive path. For example, a packet that was trapped due to unreachable neighbour need to be injected into the kernel so that it will trigger an ARP request or a neighbour solicitation message. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index df7814d55bf9..8d6b5846822c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -578,6 +578,12 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_MTU_ERROR, + DEVLINK_TRAP_GENERIC_ID_UNRESOLVED_NEIGH, + DEVLINK_TRAP_GENERIC_ID_RPF, + DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -634,6 +640,18 @@ enum devlink_trap_group_generic_id { "ipv6_mc_dip_reserved_scope" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ "ipv6_mc_dip_interface_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_MTU_ERROR \ + "mtu_value_is_too_small" +#define DEVLINK_TRAP_GENERIC_NAME_UNRESOLVED_NEIGH \ + "unresolved_neigh" +#define DEVLINK_TRAP_GENERIC_NAME_RPF \ + "mc_reverse_path_forwarding" +#define DEVLINK_TRAP_GENERIC_NAME_REJECT_ROUTE \ + "reject_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_LPM_UNICAST_MISS \ + "ipv4_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ + "ipv6_lpm_miss" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" -- cgit v1.2.3 From de7d5084d82794a8e83afb994fcb07f82da3cd7b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:14 -0800 Subject: net: provide dev_lstats_read() helper Many network drivers use hand-coded implementation of the same thing, let's factorize things so that u64_stats_t adoption is done once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1f140a6b66df..75561992c31f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2401,6 +2401,8 @@ struct pcpu_lstats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From dd5382a08157756510aa8d7269c662eccde775cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:15 -0800 Subject: net: provide dev_lstats_add() helper Many network drivers need it and hand-coded the same function. In order to ease u64_stats_t adoption, it is time to factorize. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75561992c31f..461a36220cf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2403,6 +2403,16 @@ struct pcpu_lstats { void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); +static inline void dev_lstats_add(struct net_device *dev, unsigned int len) +{ + struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); + + u64_stats_update_begin(&lstats->syncp); + lstats->bytes += len; + lstats->packets++; + u64_stats_update_end(&lstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From 316580b69d0a7aeeee5063af47438b626bc47cbd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:20 -0800 Subject: u64_stats: provide u64_stats_t type On 64bit arches, struct u64_stats_sync is empty and provides no help against load/store tearing. Using READ_ONCE()/WRITE_ONCE() would be needed. But the update side would be slightly more expensive. local64_t was defined so that we could use regular adds in a manner which is atomic wrt IRQs. However the u64_stats infra means we do not have to use local64_t on 32bit arches since the syncp provides the needed protection. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 51 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index a27604f99ed0..9de5c10293f5 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -40,8 +40,8 @@ * spin_lock_bh(...) or other synchronization to get exclusive access * ... * u64_stats_update_begin(&stats->syncp); - * stats->bytes64 += len; // non atomic operation - * stats->packets64++; // non atomic operation + * u64_stats_add(&stats->bytes64, len); // non atomic operation + * u64_stats_inc(&stats->packets64); // non atomic operation * u64_stats_update_end(&stats->syncp); * * While a consumer (reader) should use following template to get consistent @@ -52,8 +52,8 @@ * * do { * start = u64_stats_fetch_begin(&stats->syncp); - * tbytes = stats->bytes64; // non atomic operation - * tpackets = stats->packets64; // non atomic operation + * tbytes = u64_stats_read(&stats->bytes64); // non atomic operation + * tpackets = u64_stats_read(&stats->packets64); // non atomic operation * } while (u64_stats_fetch_retry(&stats->syncp, start)); * * @@ -68,6 +68,49 @@ struct u64_stats_sync { #endif }; +#if BITS_PER_LONG == 64 +#include + +typedef struct { + local64_t v; +} u64_stats_t ; + +static inline u64 u64_stats_read(const u64_stats_t *p) +{ + return local64_read(&p->v); +} + +static inline void u64_stats_add(u64_stats_t *p, unsigned long val) +{ + local64_add(val, &p->v); +} + +static inline void u64_stats_inc(u64_stats_t *p) +{ + local64_inc(&p->v); +} + +#else + +typedef struct { + u64 v; +} u64_stats_t; + +static inline u64 u64_stats_read(const u64_stats_t *p) +{ + return p->v; +} + +static inline void u64_stats_add(u64_stats_t *p, unsigned long val) +{ + p->v += val; +} + +static inline void u64_stats_inc(u64_stats_t *p) +{ + p->v++; +} +#endif static inline void u64_stats_init(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From fd2f4737870eb866537fbbffa2b59414b9b0c0a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 16:27:22 -0800 Subject: net: use u64_stats_t in struct pcpu_lstats In order to fix the data-race found by KCSAN, we can use the new u64_stats_t type and its accessors instead of plain u64 fields. This will still generate optimal code for both 32 and 64 bit platforms. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 461a36220cf4..f857f01234f7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2396,8 +2396,8 @@ struct pcpu_sw_netstats { } __aligned(4 * sizeof(u64)); struct pcpu_lstats { - u64 packets; - u64 bytes; + u64_stats_t packets; + u64_stats_t bytes; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -2408,8 +2408,8 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); u64_stats_update_begin(&lstats->syncp); - lstats->bytes += len; - lstats->packets++; + u64_stats_add(&lstats->bytes, len); + u64_stats_inc(&lstats->packets); u64_stats_update_end(&lstats->syncp); } -- cgit v1.2.3 From c305c6ae79e2ce20c22660ceda94f0d86d639a82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 18:29:11 -0800 Subject: net: add annotations on hh->hh_len lockless accesses KCSAN reported a data-race [1] While we can use READ_ONCE() on the read sides, we need to make sure hh->hh_len is written last. [1] BUG: KCSAN: data-race in eth_header_cache / neigh_resolve_output write to 0xffff8880b9dedcb8 of 4 bytes by task 29760 on cpu 0: eth_header_cache+0xa9/0xd0 net/ethernet/eth.c:247 neigh_hh_init net/core/neighbour.c:1463 [inline] neigh_resolve_output net/core/neighbour.c:1480 [inline] neigh_resolve_output+0x415/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b9dedcb8 of 4 bytes by task 29572 on cpu 1: neigh_resolve_output net/core/neighbour.c:1479 [inline] neigh_resolve_output+0x113/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29572 Comm: kworker/1:4 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events rt6_probe_deferred Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 50a67bd6a434..6a86e49181db 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -468,7 +468,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb do { seq = read_seqbegin(&hh->hh_lock); - hh_len = hh->hh_len; + hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; -- cgit v1.2.3 From f8cc62ca3e660ae3fdaee533b1d554297cd2ae82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 18:49:43 -0800 Subject: net: add a READ_ONCE() in skb_peek_tail() skb_peek_tail() can be used without protection of a lock, as spotted by KCSAN [1] In order to avoid load-stearing, add a READ_ONCE() Note that the corresponding WRITE_ONCE() are already there. [1] BUG: KCSAN: data-race in sk_wait_data / skb_queue_tail read to 0xffff8880b36a4118 of 8 bytes by task 20426 on cpu 1: skb_peek_tail include/linux/skbuff.h:1784 [inline] sk_wait_data+0x15b/0x250 net/core/sock.c:2477 kcm_wait_data+0x112/0x1f0 net/kcm/kcmsock.c:1103 kcm_recvmsg+0xac/0x320 net/kcm/kcmsock.c:1130 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680 __do_sys_recvmmsg net/socket.c:2703 [inline] __se_sys_recvmmsg net/socket.c:2696 [inline] __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880b36a4118 of 8 bytes by task 451 on cpu 0: __skb_insert include/linux/skbuff.h:1852 [inline] __skb_queue_before include/linux/skbuff.h:1958 [inline] __skb_queue_tail include/linux/skbuff.h:1991 [inline] skb_queue_tail+0x7e/0xc0 net/core/skbuff.c:3145 kcm_queue_rcv_skb+0x202/0x310 net/kcm/kcmsock.c:206 kcm_rcv_strparser+0x74/0x4b0 net/kcm/kcmsock.c:370 __strp_recv+0x348/0xf50 net/strparser/strparser.c:309 strp_recv+0x84/0xa0 net/strparser/strparser.c:343 tcp_read_sock+0x174/0x5c0 net/ipv4/tcp.c:1639 strp_read_sock+0xd4/0x140 net/strparser/strparser.c:366 do_strp_work net/strparser/strparser.c:414 [inline] strp_work+0x9a/0xe0 net/strparser/strparser.c:423 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 451 Comm: kworker/u4:3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: kstrp strp_work Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 53238ac725a3..dfe02b658829 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1795,7 +1795,7 @@ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { - struct sk_buff *skb = list_->prev; + struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; @@ -1861,7 +1861,9 @@ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { - /* see skb_queue_empty_lockless() for the opposite READ_ONCE() */ + /* See skb_queue_empty_lockless() and skb_peek_tail() + * for the opposite READ_ONCE() + */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); -- cgit v1.2.3 From 6912daed05e1370af5253aea6f2116805c0e57f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 23 Oct 2019 11:59:00 +0200 Subject: mac80211: Shrink the size of ack_frame_id to make room for tx_time_est MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To implement airtime queue limiting, we need to keep a running account of the estimated airtime of all skbs queued into the device. Do to this correctly, we need to store the airtime estimate into the skb so we can decrease the outstanding balance when the skb is freed. This means that the time estimate must be stored somewhere that will survive for the lifetime of the skb. To get this, decrease the size of the ack_frame_id field to 6 bits, and lower the size of the ID space accordingly. This leaves 10 bits for use for tx_time_est, which is enough to store a maximum of 4096 us, if we shift the values so they become units of 4us. Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/157182474063.150713.16132669599100802716.stgit@toke.dk Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f5996960eace..c643a19dce96 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -967,6 +967,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @band: the band to transmit on (use for checking for races) * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC * @ack_frame_id: internal frame ID for TX status, used internally + * @tx_time_est: TX time estimate in units of 4us, used internally * @control: union part for control data * @control.rates: TX rates array to try * @control.rts_cts_rate_idx: rate for RTS or CTS @@ -1007,7 +1008,8 @@ struct ieee80211_tx_info { u8 hw_queue; - u16 ack_frame_id; + u16 ack_frame_id:6; + u16 tx_time_est:10; union { struct { -- cgit v1.2.3 From 14f34e36b36ceede9877ca422a62fcac17b52023 Mon Sep 17 00:00:00 2001 From: Gurumoorthi Gnanasambandhan Date: Thu, 31 Oct 2019 23:46:40 +0200 Subject: cfg80211: VLAN offload support for set_key and set_sta_vlan This provides an alternative mechanism for AP VLAN support where a single netdev is used with VLAN tagged frames instead of separate netdevs for each VLAN without tagged frames from the WLAN driver. By setting NL80211_EXT_FEATURE_VLAN_OFFLOAD flag the driver indicates support for a single netdev with VLAN tagged frames. Separate VLAN-specific netdevs can be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to Ethernet. NL80211_CMD_NEW_KEY (for group keys), NL80211_CMD_NEW_STATION, and NL80211_CMD_SET_STATION will optionally specify vlan_id using NL80211_ATTR_VLAN_ID. Signed-off-by: Gurumoorthi Gnanasambandhan Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20191031214640.5012-1-jouni@codeaurora.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4ab2c49423dc..e309cc826b40 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -565,6 +565,7 @@ struct vif_params { * with the get_key() callback, must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. + * @vlan_id: vlan_id for VLAN group key (if nonzero) * @mode: key install mode (RX_TX, NO_TX or SET_TX) */ struct key_params { @@ -572,6 +573,7 @@ struct key_params { const u8 *seq; int key_len; int seq_len; + u16 vlan_id; u32 cipher; enum nl80211_key_mode mode; }; @@ -1124,6 +1126,7 @@ struct sta_txpwr { * (bitmask of BIT(%NL80211_STA_FLAG_...)) * @listen_interval: listen interval or -1 for no change * @aid: AID or zero for no change + * @vlan_id: VLAN ID for station (if nonzero) * @peer_aid: mesh peer AID or zero for no change * @plink_action: plink action to take * @plink_state: set the peer link state for a station @@ -1159,6 +1162,7 @@ struct station_parameters { u32 sta_modify_mask; int listen_interval; u16 aid; + u16 vlan_id; u16 peer_aid; u8 supported_rates_len; u8 plink_action; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 64135ab3a7ac..341e0e8cae46 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -248,6 +248,22 @@ * %NL80211_ATTR_SAE_PASSWORD. */ +/** + * DOC: VLAN offload support for setting group keys and binding STAs to VLANs + * + * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they + * support offloading VLAN functionality in a manner where the driver exposes a + * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs + * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet + * case. Frames received from stations that are not assigned to any VLAN are + * delivered on the main netdev and frames to such stations can be sent through + * that main netdev. + * + * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and + * %NL80211_CMD_SET_STATION will optionally specify vlan_id using + * %NL80211_ATTR_VLAN_ID. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -2381,6 +2397,9 @@ enum nl80211_commands { * the allowed channel bandwidth configurations. (u8 attribute) * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. * + * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key + * (u16). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2843,6 +2862,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_EDMG_CHANNELS, NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + NL80211_ATTR_VLAN_ID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5492,6 +5513,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in * station mode (SAE password is passed as part of the connect command). * + * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev + * with VLAN tagged frames and separate VLAN-specific netdevs added using + * vconfig similarly to the Ethernet case. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5537,6 +5562,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_EXT_KEY_ID, NL80211_EXT_FEATURE_STA_TX_PWR, NL80211_EXT_FEATURE_SAE_OFFLOAD, + NL80211_EXT_FEATURE_VLAN_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 5bd90b0989731520f2cdcfbbe467f1271f3cc803 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 7 Nov 2019 16:04:11 +0000 Subject: KVM: vgic-v4: Track the number of VLPIs per vcpu In order to find out whether a vcpu is likely to be the target of VLPIs (and to further optimize the way we deal with those), let's track the number of VLPIs a vcpu can receive. This gets implemented with an atomic variable that gets incremented or decremented on map, unmap and move of a VLPI. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Reviewed-by: Christoffer Dall Link: https://lore.kernel.org/r/20191107160412.30301-2-maz@kernel.org --- include/linux/irqchip/arm-gic-v4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index ab1396afe08a..5dbcfc65f21e 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -32,6 +32,8 @@ struct its_vm { struct its_vpe { struct page *vpt_page; struct its_vm *its_vm; + /* per-vPE VLPI tracking */ + atomic_t vlpi_count; /* Doorbell interrupt */ int irq; irq_hw_number_t vpe_db_lpi; -- cgit v1.2.3 From 90b2be27bb0e56483f335cc10fb59ec66882b949 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Nov 2019 08:45:23 -0800 Subject: net/sched: annotate lockless accesses to qdisc->empty KCSAN reported the following race [1] BUG: KCSAN: data-race in __dev_queue_xmit / net_tx_action read to 0xffff8880ba403508 of 1 bytes by task 21814 on cpu 1: __dev_xmit_skb net/core/dev.c:3389 [inline] __dev_queue_xmit+0x9db/0x1b40 net/core/dev.c:3761 dev_queue_xmit+0x21/0x30 net/core/dev.c:3825 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip6_finish_output2+0x873/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880ba403508 of 1 bytes by interrupt on cpu 0: qdisc_run_begin include/net/sch_generic.h:160 [inline] qdisc_run include/net/pkt_sched.h:120 [inline] net_tx_action+0x2b1/0x6c0 net/core/dev.c:4551 __do_softirq+0x115/0x33f kernel/softirq.c:292 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337 do_softirq kernel/softirq.c:329 [inline] __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189 local_bh_enable include/linux/bottom_half.h:32 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:688 [inline] ip6_finish_output2+0x7bb/0xec0 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 21817 Comm: syz-executor.2 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Paolo Abeni Cc: Davide Caratti Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a8b0a9a4c686..d43da37737be 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -148,8 +148,8 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) - return qdisc->empty; - return !qdisc->q.qlen; + return READ_ONCE(qdisc->empty); + return !READ_ONCE(qdisc->q.qlen); } static inline bool qdisc_run_begin(struct Qdisc *qdisc) @@ -157,7 +157,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) if (qdisc->flags & TCQ_F_NOLOCK) { if (!spin_trylock(&qdisc->seqlock)) return false; - qdisc->empty = false; + WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; } -- cgit v1.2.3 From 134bdac397661a5841d9f27f508190c68b26232b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:10 +0700 Subject: tipc: add new AEAD key structure for user API The new structure 'tipc_aead_key' is added to the 'tipc.h' for user to be able to transfer a key to TIPC in kernel. Netlink will be used for this purpose in the later commits. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 76421b878767..add01db1daef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -233,6 +233,27 @@ struct tipc_sioc_nodeid_req { char node_id[TIPC_NODEID_LEN]; }; +/* + * TIPC Crypto, AEAD + */ +#define TIPC_AEAD_ALG_NAME (32) + +struct tipc_aead_key { + char alg_name[TIPC_AEAD_ALG_NAME]; + unsigned int keylen; /* in bytes */ + char key[]; +}; + +#define TIPC_AEAD_KEYLEN_MIN (16 + 4) +#define TIPC_AEAD_KEYLEN_MAX (32 + 4) +#define TIPC_AEAD_KEY_SIZE_MAX (sizeof(struct tipc_aead_key) + \ + TIPC_AEAD_KEYLEN_MAX) + +static inline int tipc_aead_key_size(struct tipc_aead_key *key) +{ + return sizeof(*key) + key->keylen; +} + /* The macros and functions below are deprecated: */ -- cgit v1.2.3 From e1f32190cf7ddd55778b460e7d44af3f76529698 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:12 +0700 Subject: tipc: add support for AEAD key setting via netlink This commit adds two netlink commands to TIPC in order for user to be able to set or remove AEAD keys: - TIPC_NL_KEY_SET - TIPC_NL_KEY_FLUSH When the 'KEY_SET' is given along with the key data, the key will be initiated and attached to TIPC crypto. On the other hand, the 'KEY_FLUSH' command will remove all existing keys if any. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index efb958fd167d..6c2194ab745b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -63,6 +63,8 @@ enum { TIPC_NL_PEER_REMOVE, TIPC_NL_BEARER_ADD, TIPC_NL_UDP_GET_REMOTEIP, + TIPC_NL_KEY_SET, + TIPC_NL_KEY_FLUSH, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 @@ enum { TIPC_NLA_NODE_UNSPEC, TIPC_NLA_NODE_ADDR, /* u32 */ TIPC_NLA_NODE_UP, /* flag */ + TIPC_NLA_NODE_ID, /* data */ + TIPC_NLA_NODE_KEY, /* data */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From a0c76345e3d3dbc40c39de2e00d15a3b7eef7885 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 8 Nov 2019 21:42:43 +0100 Subject: devlink: disallow reload operation during device cleanup There is a race between driver code that does setup/cleanup of device and devlink reload operation that in some drivers works with the same code. Use after free could we easily obtained by running: while true; do echo 10 > /sys/bus/netdevsim/new_device devlink dev reload netdevsim/netdevsim10 & echo 10 > /sys/bus/netdevsim/del_device done Fix this by enabling reload only after setup of device is complete and disabling it at the beginning of the cleanup process. Reported-by: Ido Schimmel Fixes: 2d8dc5bbf4e7 ("devlink: Add support for reload") Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d6b5846822c..7891611868e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -38,8 +38,9 @@ struct devlink { struct device *dev; possible_net_t _net; struct mutex lock; - bool reload_failed; - bool registered; + u8 reload_failed:1, + reload_enabled:1, + registered:1; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -824,6 +825,8 @@ void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); +void devlink_reload_enable(struct devlink *devlink); +void devlink_reload_disable(struct devlink *devlink); void devlink_free(struct devlink *devlink); int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, -- cgit v1.2.3 From aef587be42925f92418083f08852d0011b2766ca Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:32 +0800 Subject: sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 8 ++++++++ include/net/sctp/constants.h | 10 ++++++++++ include/net/sctp/structs.h | 2 ++ include/uapi/linux/sctp.h | 1 + 4 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index bdc0f27b8514..18c3ddae77a3 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -96,6 +96,14 @@ struct netns_sctp { */ int pf_enable; + /* + * Disable Potentially-Failed state exposure, ignored by default + * pf_expose - 0 : compatible with old applications (by default) + * - 1 : disable pf state exposure + * - 2 : enable pf state exposure + */ + int pf_expose; + /* * Policy for preforming sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_sndbuf diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 823afc42a3aa..e88b77a34cb1 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -286,6 +286,16 @@ enum { SCTP_MAX_GABS = 16 }; * functions simpler to write. */ +/* These are the values for pf exposure, UNUSED is to keep compatible with old + * applications by default. + */ +enum { + SCTP_PF_EXPOSE_UNSET, + SCTP_PF_EXPOSE_DISABLE, + SCTP_PF_EXPOSE_ENABLE, +}; +#define SCTP_PF_EXPOSE_MAX SCTP_PF_EXPOSE_ENABLE + /* These return values describe the success or failure of a number of * routines which form the lower interface to SCTP_outqueue. */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 503fbc3cd819..9a43738774d7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -215,6 +215,7 @@ struct sctp_sock { __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, + pf_expose:2, reuse:1, disable_fragments:1, v4mapped:1, @@ -2053,6 +2054,7 @@ struct sctp_association { __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ + pf_expose:2, /* Expose pf state? */ force_delay:1; __u8 strreset_enable; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6bce7f9837a9..765f41a080b4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -933,6 +933,7 @@ struct sctp_paddrinfo { enum sctp_spinfo_state { SCTP_INACTIVE, SCTP_PF, +#define SCTP_POTENTIALLY_FAILED SCTP_PF SCTP_ACTIVE, SCTP_UNCONFIRMED, SCTP_UNKNOWN = 0xffff /* Value used for transport state unknown */ -- cgit v1.2.3 From 768e15182dcb809e39c338290dda10c4e271d133 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:33 +0800 Subject: sctp: add SCTP_ADDR_POTENTIALLY_FAILED notification SCTP Quick failover draft section 5.1, point 5 has been removed from rfc7829. Instead, "the sender SHOULD (i) notify the Upper Layer Protocol (ULP) about this state transition", as said in section 3.2, point 8. So this patch is to add SCTP_ADDR_POTENTIALLY_FAILED, defined in section 7.1, "which is reported if the affected address becomes PF". Also remove transport cwnd's update when moving from PF back to ACTIVE , which is no longer in rfc7829 either. Note that ulp_notify will be set to false if asoc->expose is not 'enabled', according to last patch. v2->v3: - define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED. v3->v4: - initialize spc_state with SCTP_ADDR_AVAILABLE, as Marcelo suggested. - check asoc->pf_expose in sctp_assoc_control_transport(), as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 765f41a080b4..d99b428ac34e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -410,6 +410,8 @@ enum sctp_spc_state { SCTP_ADDR_ADDED, SCTP_ADDR_MADE_PRIM, SCTP_ADDR_CONFIRMED, + SCTP_ADDR_POTENTIALLY_FAILED, +#define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED }; -- cgit v1.2.3 From 8d2a6935d842f12c25611b165eace778adb09a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:34 +0800 Subject: sctp: add SCTP_EXPOSE_POTENTIALLY_FAILED_STATE sockopt This is a sockopt defined in section 7.3 of rfc7829: "Exposing the Potentially Failed Path State", by which users can change pf_expose per sock and asoc. The new sockopt SCTP_EXPOSE_POTENTIALLY_FAILED_STATE is also known as SCTP_EXPOSE_PF_STATE for short. v2->v3: - return -EINVAL if params.assoc_value > SCTP_PF_EXPOSE_MAX. - define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE. v3->v4: - improve changelog. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d99b428ac34e..a190e4a7f546 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -137,6 +137,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_ASCONF_SUPPORTED 128 #define SCTP_AUTH_SUPPORTED 129 #define SCTP_ECN_SUPPORTED 130 +#define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE 131 +#define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit v1.2.3 From 34515e94c92c3f593cd696abca8609246cbd75e6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:35 +0800 Subject: sctp: add support for Primary Path Switchover This is a new feature defined in section 5 of rfc7829: "Primary Path Switchover". By introducing a new tunable parameter: Primary.Switchover.Max.Retrans (PSMR) The primary path will be changed to another active path when the path error counter on the old primary path exceeds PSMR, so that "the SCTP sender is allowed to continue data transmission on a new working path even when the old primary destination address becomes active again". This patch is to add this tunable parameter, 'ps_retrans' per netns, sock, asoc and transport. It also allows a user to change ps_retrans per netns by sysctl, and ps_retrans per sock/asoc/transport will be initialized with it. The check will be done in sctp_do_8_2_transport_strike() when this feature is enabled. Note this feature is disabled by initializing 'ps_retrans' per netns as 0xffff by default, and its value can't be less than 'pf_retrans' when changing by sysctl. v3->v4: - add define SCTP_PS_RETRANS_MAX 0xffff, and use it on extra2 of sysctl 'ps_retrans'. - add a new entry for ps_retrans on ip-sysctl.txt. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 6 ++++++ include/net/sctp/constants.h | 2 ++ include/net/sctp/structs.h | 11 ++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 18c3ddae77a3..d8d02e4188d1 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -89,6 +89,12 @@ struct netns_sctp { */ int pf_retrans; + /* Primary.Switchover.Max.Retrans sysctl value + * taken from: + * https://tools.ietf.org/html/rfc7829 + */ + int ps_retrans; + /* * Disable Potentially-Failed feature, the feature is enabled by default * pf_enable - 0 : disable pf diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index e88b77a34cb1..15b4d9aec7ff 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,6 +296,8 @@ enum { }; #define SCTP_PF_EXPOSE_MAX SCTP_PF_EXPOSE_ENABLE +#define SCTP_PS_RETRANS_MAX 0xffff + /* These return values describe the success or failure of a number of * routines which form the lower interface to SCTP_outqueue. */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9a43738774d7..3cc913f328cd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -184,7 +184,8 @@ struct sctp_sock { __u32 flowlabel; __u8 dscp; - int pf_retrans; + __u16 pf_retrans; + __u16 ps_retrans; /* The initial Path MTU to use for new associations. */ __u32 pathmtu; @@ -897,7 +898,9 @@ struct sctp_transport { * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option */ - int pf_retrans; + __u16 pf_retrans; + /* Used for primary path switchover. */ + __u16 ps_retrans; /* PMTU : The current known path MTU. */ __u32 pathmtu; @@ -1773,7 +1776,9 @@ struct sctp_association { * and will be initialized from the assocs value. This can be * changed using the SCTP_PEER_ADDR_THLDS socket option */ - int pf_retrans; + __u16 pf_retrans; + /* Used for primary path switchover. */ + __u16 ps_retrans; /* Maximum number of times the endpoint will retransmit INIT */ __u16 max_init_attempts; -- cgit v1.2.3 From d467ac0a38551a5904878b1f5a2fe20a040c0e11 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:36 +0800 Subject: sctp: add SCTP_PEER_ADDR_THLDS_V2 sockopt Section 7.2 of rfc7829: "Peer Address Thresholds (SCTP_PEER_ADDR_THLDS) Socket Option" extends 'struct sctp_paddrthlds' with 'spt_pathcpthld' added to allow a user to change ps_retrans per sock/asoc/transport, as other 2 paddrthlds: pf_retrans, pathmaxrxt. Note: to not break the user's program, here to support pf_retrans dump and setting by adding a new sockopt SCTP_PEER_ADDR_THLDS_V2, and a new structure sctp_paddrthlds_v2 instead of extending sctp_paddrthlds. Also, when setting ps_retrans, the value is not allowed to be greater than pf_retrans. v1->v2: - use SCTP_PEER_ADDR_THLDS_V2 to set/get pf_retrans instead, as Marcelo and David Laight suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a190e4a7f546..28ad40d9acba 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -105,6 +105,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 #define SCTP_REUSE_PORT 36 +#define SCTP_PEER_ADDR_THLDS_V2 37 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -1087,6 +1088,15 @@ struct sctp_paddrthlds { __u16 spt_pathpfthld; }; +/* Use a new structure with spt_pathcpthld for back compatibility */ +struct sctp_paddrthlds_v2 { + sctp_assoc_t spt_assoc_id; + struct sockaddr_storage spt_address; + __u16 spt_pathmaxrxt; + __u16 spt_pathpfthld; + __u16 spt_pathcpthld; +}; + /* * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status */ -- cgit v1.2.3 From bc9ad9e40dbc4c8874e806345df393a9cfeadad3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:02 +0000 Subject: EDAC: Replace EDAC_DIMM_PTR() macro with edac_get_dimm() function The EDAC_DIMM_PTR() macro takes 3 arguments from struct mem_ctl_info. Clean up this interface to only pass the mci struct and replace this macro with a new function edac_get_dimm(). Also introduce an edac_get_dimm_by_index() function for later use. This allows it to get a DIMM pointer only by a given index. This can be useful if the DIMM's position within the layers of the memory controller or the exact size of the layers are unknown. Small style changes made for some hunks after applying the semantic patch. Semantic patch used: @@ expression mci, a, b,c; @@ -EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, a, b, c) +edac_get_dimm(mci, a, b, c) [ bp: Touchups. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Jason Baron Cc: Qiuxu Zhuo Cc: Tero Kristo Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-2-rrichter@marvell.com --- include/linux/edac.h | 89 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/edac.h b/include/linux/edac.h index c19483b90079..280d109b9d05 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -403,37 +403,6 @@ struct edac_mc_layer { __i; \ }) -/** - * EDAC_DIMM_PTR - Macro responsible to get a pointer inside a pointer array - * for the element given by [layer0,layer1,layer2] position - * - * @layers: a struct edac_mc_layer array, describing how many elements - * were allocated for each layer - * @var: name of the var where we want to get the pointer - * (like mci->dimms) - * @nlayers: Number of layers at the @layers array - * @layer0: layer0 position - * @layer1: layer1 position. Unused if n_layers < 2 - * @layer2: layer2 position. Unused if n_layers < 3 - * - * For 1 layer, this macro returns "var[layer0]"; - * - * For 2 layers, this macro is similar to allocate a bi-dimensional array - * and to return "var[layer0][layer1]"; - * - * For 3 layers, this macro is similar to allocate a tri-dimensional array - * and to return "var[layer0][layer1][layer2]"; - */ -#define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \ - typeof(*var) __p; \ - int ___i = EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2); \ - if (___i < 0) \ - __p = NULL; \ - else \ - __p = (var)[___i]; \ - __p; \ -}) - struct dimm_info { struct device dev; @@ -669,4 +638,60 @@ struct mem_ctl_info { bool fake_inject_ue; u16 fake_inject_count; }; -#endif + +/** + * edac_get_dimm_by_index - Get DIMM info at @index from a memory + * controller + * + * @mci: MC descriptor struct mem_ctl_info + * @index: index in the memory controller's DIMM array + * + * Returns a struct dimm_info * or NULL on failure. + */ +static inline struct dimm_info * +edac_get_dimm_by_index(struct mem_ctl_info *mci, int index) +{ + if (index < 0 || index >= mci->tot_dimms) + return NULL; + + return mci->dimms[index]; +} + +/** + * edac_get_dimm - Get DIMM info from a memory controller given by + * [layer0,layer1,layer2] position + * + * @mci: MC descriptor struct mem_ctl_info + * @layer0: layer0 position + * @layer1: layer1 position. Unused if n_layers < 2 + * @layer2: layer2 position. Unused if n_layers < 3 + * + * For 1 layer, this function returns "dimms[layer0]"; + * + * For 2 layers, this function is similar to allocating a two-dimensional + * array and returning "dimms[layer0][layer1]"; + * + * For 3 layers, this function is similar to allocating a tri-dimensional + * array and returning "dimms[layer0][layer1][layer2]"; + */ +static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, + int layer0, int layer1, int layer2) +{ + int index; + + if (layer0 < 0 + || (mci->n_layers > 1 && layer1 < 0) + || (mci->n_layers > 2 && layer2 < 0)) + return NULL; + + index = layer0; + + if (mci->n_layers > 1) + index = index * mci->layers[1].size + layer1; + + if (mci->n_layers > 2) + index = index * mci->layers[2].size + layer2; + + return edac_get_dimm_by_index(mci, index); +} +#endif /* _LINUX_EDAC_H_ */ -- cgit v1.2.3 From 977b1ce7c117905b3138dc727ed25f8af2ba2902 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:04 +0000 Subject: EDAC: Remove EDAC_DIMM_OFF() macro The EDAC_DIMM_OFF() macro takes 5 arguments to get the DIMM's index. Simplify this by storing the index in struct dimm_info to avoid its calculation and remove the EDAC_DIMM_OFF() macro. The index can be directly used then. Another advantage is that edac_mc_alloc() could be used even if the exact size of the layers is unknown. Only the number of DIMMs would be needed. Rename iterator variable to idx, while at it. The name is more handy, esp. when searching for it in the code. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-3-rrichter@marvell.com --- include/linux/edac.h | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/edac.h b/include/linux/edac.h index 280d109b9d05..f4ebb14bc406 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -362,47 +362,6 @@ struct edac_mc_layer { */ #define EDAC_MAX_LAYERS 3 -/** - * EDAC_DIMM_OFF - Macro responsible to get a pointer offset inside a pointer - * array for the element given by [layer0,layer1,layer2] - * position - * - * @layers: a struct edac_mc_layer array, describing how many elements - * were allocated for each layer - * @nlayers: Number of layers at the @layers array - * @layer0: layer0 position - * @layer1: layer1 position. Unused if n_layers < 2 - * @layer2: layer2 position. Unused if n_layers < 3 - * - * For 1 layer, this macro returns "var[layer0] - var"; - * - * For 2 layers, this macro is similar to allocate a bi-dimensional array - * and to return "var[layer0][layer1] - var"; - * - * For 3 layers, this macro is similar to allocate a tri-dimensional array - * and to return "var[layer0][layer1][layer2] - var". - * - * A loop could be used here to make it more generic, but, as we only have - * 3 layers, this is a little faster. - * - * By design, layers can never be 0 or more than 3. If that ever happens, - * a NULL is returned, causing an OOPS during the memory allocation routine, - * with would point to the developer that he's doing something wrong. - */ -#define EDAC_DIMM_OFF(layers, nlayers, layer0, layer1, layer2) ({ \ - int __i; \ - if ((nlayers) == 1) \ - __i = layer0; \ - else if ((nlayers) == 2) \ - __i = (layer1) + ((layers[1]).size * (layer0)); \ - else if ((nlayers) == 3) \ - __i = (layer2) + ((layers[2]).size * ((layer1) + \ - ((layers[1]).size * (layer0)))); \ - else \ - __i = -EINVAL; \ - __i; \ -}) - struct dimm_info { struct device dev; @@ -412,6 +371,7 @@ struct dimm_info { unsigned int location[EDAC_MAX_LAYERS]; struct mem_ctl_info *mci; /* the parent */ + unsigned int idx; /* index within the parent dimm array */ u32 grain; /* granularity of reported error in bytes */ enum dev_type dtype; /* memory device type */ @@ -654,6 +614,9 @@ edac_get_dimm_by_index(struct mem_ctl_info *mci, int index) if (index < 0 || index >= mci->tot_dimms) return NULL; + if (WARN_ON_ONCE(mci->dimms[index]->idx != index)) + return NULL; + return mci->dimms[index]; } -- cgit v1.2.3 From 727b3668b730634228fc65c336c2a7a080e02885 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 8 Nov 2019 17:39:29 +0000 Subject: net: sfp: rework upstream interface The current upstream interface is an all-or-nothing, which is sub-optimal for future changes, as it doesn't allow the upstream driver to prepare for the SFP module becoming available, as it is at boot. Switch to a find-sfp-bus, add-upstream, del-upstream, put-sfp-bus interface structure instead, which allows the upstream driver to prepare for a module being available as soon as add-upstream is called. Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/sfp.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 355a08a76fd4..c8464de7cff5 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -508,10 +508,11 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data); void sfp_upstream_start(struct sfp_bus *bus); void sfp_upstream_stop(struct sfp_bus *bus); -struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode, - void *upstream, - const struct sfp_upstream_ops *ops); -void sfp_unregister_upstream(struct sfp_bus *bus); +void sfp_bus_put(struct sfp_bus *bus); +struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode); +int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops); +void sfp_bus_del_upstream(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -553,14 +554,22 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus) { } -static inline struct sfp_bus *sfp_register_upstream_node( - struct fwnode_handle *fwnode, void *upstream, - const struct sfp_upstream_ops *ops) +static inline void sfp_bus_put(struct sfp_bus *bus) +{ +} + +static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode) { return NULL; } -static inline void sfp_unregister_upstream(struct sfp_bus *bus) +static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops) +{ + return 0; +} + +static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } #endif -- cgit v1.2.3 From c498afaf7df87f44e7cb383c135baec52b5259be Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:07 +0000 Subject: EDAC: Introduce an mci_for_each_dimm() iterator Introduce an mci_for_each_dimm() iterator. It returns a pointer to a struct dimm_info. This makes the declaration and use of an index obsolete and avoids access to internal data of struct mci (direct array access etc). [ bp: push the struct dimm_info *dimm; declaration into the CONFIG_EDAC_DEBUG block. ] Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-4-rrichter@marvell.com --- include/linux/edac.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/edac.h b/include/linux/edac.h index f4ebb14bc406..31f99d48b024 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -599,6 +599,13 @@ struct mem_ctl_info { u16 fake_inject_count; }; +#define mci_for_each_dimm(mci, dimm) \ + for ((dimm) = (mci)->dimms[0]; \ + (dimm); \ + (dimm) = (dimm)->idx + 1 < (mci)->tot_dimms \ + ? (mci)->dimms[(dimm)->idx + 1] \ + : NULL) + /** * edac_get_dimm_by_index - Get DIMM info at @index from a memory * controller -- cgit v1.2.3 From 98edb865bd3ee2a67e51e0d947208f3a2129a460 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 Nov 2019 09:33:18 +0000 Subject: EDAC: Remove misleading comment in struct edac_raw_error_desc There never has been such function edac_raw_error_desc_clean() and in function ghes_edac_report_mem_error() the whole struct is zero'ed including the string arrays. Remove that comment. Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Reviewed-by: Mauro Carvalho Chehab Cc: "linux-edac@vger.kernel.org" Cc: James Morse Cc: Tony Luck Link: https://lkml.kernel.org/r/20191106093239.25517-9-rrichter@marvell.com --- include/linux/edac.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/edac.h b/include/linux/edac.h index 31f99d48b024..cc31b9742684 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -457,15 +457,10 @@ struct errcount_attribute_data { * (typically, a memory controller error) */ struct edac_raw_error_desc { - /* - * NOTE: everything before grain won't be cleaned by - * edac_raw_error_desc_clean() - */ char location[LOCATION_SIZE]; char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS]; long grain; - /* the vars below and grain will be cleaned on every new error report */ u16 error_count; int top_layer; int mid_layer; -- cgit v1.2.3 From 39d1e3340c73e8f7eb1d6a8cae561c255ca7b1b0 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 7 Nov 2019 08:41:51 +0000 Subject: mtd: spi-nor: Fix clearing of QE bit on lock()/unlock() Make sure that when doing a lock() or an unlock() operation we don't clear the QE bit from Status Register 2. JESD216 revB or later offers information about the *default* Status Register commands to use (see BFPT DWORDS[15], bits 22:20). In this standard, Status Register 1 refers to the first data byte transferred on a Read Status (05h) or Write Status (01h) command. Status register 2 refers to the byte read using instruction 35h. Status register 2 is the second byte transferred in a Write Status (01h) command. Industry naming and definitions of these Status Registers may differ. The definitions are described in JESD216B, BFPT DWORDS[15], bits 22:20. There are cases in which writing only one byte to the Status Register 1 has the side-effect of clearing Status Register 2 and implicitly the Quad Enable bit. This side-effect is hit just by the BFPT_DWORD15_QER_SR2_BIT1_BUGGY and BFPT_DWORD15_QER_SR2_BIT1 cases. Suggested-by: Boris Brezillon Signed-off-by: Tudor Ambarus Reviewed-by: Vignesh Raghavendra --- include/linux/mtd/spi-nor.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index d1d736d3c8ab..d6ec55cc6d97 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -243,6 +243,9 @@ enum spi_nor_option_flags { SNOR_F_4B_OPCODES = BIT(6), SNOR_F_HAS_4BAIT = BIT(7), SNOR_F_HAS_LOCK = BIT(8), + SNOR_F_HAS_16BIT_SR = BIT(9), + SNOR_F_NO_READ_CR = BIT(10), + }; /** -- cgit v1.2.3 From 3e0930f109e76922ea1742a9c8c1cc16f052ad45 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 7 Nov 2019 08:41:55 +0000 Subject: mtd: spi-nor: Rework the disabling of block write protection spi_nor_unlock() unlocks blocks of memory or the entire flash memory array, if requested. clear_sr_bp() unlocks the entire flash memory array at boot time. This calls for some unification, clear_sr_bp() is just an optimization for the case when the unlock request covers the entire flash size. Get rid of clear_sr_bp() and introduce spi_nor_unlock_all(), which is just a call to spi_nor_unlock() for the entire flash memory array. This fixes a bug that was present in spi_nor_spansion_clear_sr_bp(). When the QE bit was zero, we used the Write Status (01h) command with one data byte, which might cleared the Status Register 2. We now always use the Write Status (01h) command with two data bytes when SNOR_F_HAS_16BIT_SR is set, to avoid clearing the Status Register 2. The SNOR_F_NO_READ_CR case is treated as well. When the flash doesn't support the CR Read command, we make an assumption about the value of the QE bit. In spi_nor_init(), call spi_nor_quad_enable() first, then spi_nor_unlock_all(), so that at the spi_nor_unlock_all() time we can be sure the QE bit has value one, because of the previous call to spi_nor_quad_enable(). Get rid of the MFR handling and implement specific manufacturer default_init() fixup hooks. Note that this changes a bit the logic for the SNOR_MFR_ATMEL, SNOR_MFR_INTEL and SNOR_MFR_SST cases. Before this patch, the Atmel, Intel and SST chips did not set the locking ops, but unlocked the entire flash at boot time, while now they are setting the locking ops to stm_locking_ops. This should work, since the disable of the block protection at the boot time used the same Status Register bits to unlock the flash, as in the stm_locking_ops case. Suggested-by: Boris Brezillon Signed-off-by: Tudor Ambarus Reviewed-by: Vignesh Raghavendra --- include/linux/mtd/spi-nor.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index d6ec55cc6d97..11daecc5a83d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -581,8 +581,6 @@ struct flash_info; * @write_proto: the SPI protocol for write operations * @reg_proto the SPI protocol for read_reg/write_reg/erase operations * @controller_ops: SPI NOR controller driver specific operations. - * @clear_sr_bp: [FLASH-SPECIFIC] clears the Block Protection Bits from - * the SPI NOR Status Register. * @params: [FLASH-SPECIFIC] SPI-NOR flash parameters and settings. * The structure includes legacy flash parameters and * settings that can be overwritten by the spi_nor_fixups @@ -611,7 +609,6 @@ struct spi_nor { const struct spi_nor_controller_ops *controller_ops; - int (*clear_sr_bp)(struct spi_nor *nor); struct spi_nor_flash_parameter params; void *priv; -- cgit v1.2.3 From bb2dc7f46ad897ba1c2d8ae773c77601ba240932 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 7 Nov 2019 08:42:01 +0000 Subject: mtd: spi-nor: Rename CR_QUAD_EN_SPAN to SR2_QUAD_EN_BIT1 JEDEC Basic Flash Parameter Table, 15th DWORD, bits 22:20, refers to this bit as "bit 1 of the status register 2". Rename the macro accordingly. Signed-off-by: Tudor Ambarus Reviewed-by: Vignesh Raghavendra --- include/linux/mtd/spi-nor.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 11daecc5a83d..364309845de0 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -144,10 +144,8 @@ #define FSR_P_ERR BIT(4) /* Program operation status */ #define FSR_PT_ERR BIT(1) /* Protection error bit */ -/* Configuration Register bits. */ -#define CR_QUAD_EN_SPAN BIT(1) /* Spansion Quad I/O */ - /* Status Register 2 bits. */ +#define SR2_QUAD_EN_BIT1 BIT(1) #define SR2_QUAD_EN_BIT7 BIT(7) /* Supported SPI protocols */ -- cgit v1.2.3 From 658488ed2108f5772572c5a17c3f31ed6e554edc Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Thu, 7 Nov 2019 08:42:09 +0000 Subject: mtd: spi-nor: Rename Quad Enable methods Rename macronix_quad_enable() to a generic name: spi_nor_sr1_bit6_quad_enable(). Prepend "spi_nor_" to "sr2_bit7_quad_enable". All SPI NOR generic methods should be prepended by "spi_nor_". Signed-off-by: Tudor Ambarus Reviewed-by: Vignesh Raghavendra --- include/linux/mtd/spi-nor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 364309845de0..9eae35c60bce 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -133,7 +133,7 @@ #define SR_E_ERR BIT(5) #define SR_P_ERR BIT(6) -#define SR_QUAD_EN_MX BIT(6) /* Macronix Quad I/O */ +#define SR1_QUAD_EN_BIT6 BIT(6) /* Enhanced Volatile Configuration Register bits */ #define EVCR_QUAD_EN_MICRON BIT(7) /* Micron Quad I/O */ -- cgit v1.2.3 From 83cba933a6db1dd4d7ac85170f99461fbc339eff Mon Sep 17 00:00:00 2001 From: Sagar Shrikant Kadam Date: Tue, 22 Oct 2019 17:22:19 +0000 Subject: mtd: spi-nor: Set default Quad Enable method for ISSI flashes Set the default Quad Enable method for ISSI flashes. Used for ISSI flashes (IS25WP256D-JMLE) that do not support SFDP tables and can not determine the Quad Enable method by parsing BFPT. Based on code originally written by Wesley Terpstra and/or Palmer Dabbelt https://github.com/riscv/riscv-linux/commit/c94e267766d62bc9a669611c3d0c8ed5ea26569b Signed-off-by: Sagar Shrikant Kadam [tudor.ambarus@microchip.com: - rebase, split and adapt for latest spi-nor/next, - use PMC CFI ID for ISSI. According to JEP106BA, "Programmable Micro Corp" changed its name to Integrated Silicon Solution (ISSI)] Signed-off-by: Tudor Ambarus Reviewed-by: Vignesh Raghavendra --- include/linux/mtd/spi-nor.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 9eae35c60bce..5a4623fc586b 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -22,6 +22,7 @@ #define SNOR_MFR_INTEL CFI_MFR_INTEL #define SNOR_MFR_ST CFI_MFR_ST /* ST Micro */ #define SNOR_MFR_MICRON CFI_MFR_MICRON /* Micron */ +#define SNOR_MFR_ISSI CFI_MFR_PMC #define SNOR_MFR_MACRONIX CFI_MFR_MACRONIX #define SNOR_MFR_SPANSION CFI_MFR_AMD #define SNOR_MFR_SST CFI_MFR_SST -- cgit v1.2.3 From 6c7295e13ffd5623b02f1adc1442f1d8a3d52424 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Fri, 8 Nov 2019 23:45:20 +0000 Subject: devlink: Add new "enable_roce" generic device param New device parameter to enable/disable handling of RoCE traffic in the device. Signed-off-by: Michael Guralnik Acked-by: Jiri Pirko Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..39fb4d957838 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -400,6 +400,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -434,6 +435,9 @@ enum devlink_param_generic_id { "reset_dev_on_drv_probe" #define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE DEVLINK_PARAM_TYPE_U8 +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME "enable_roce" +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ -- cgit v1.2.3 From cc9defcbb8fae52810f7795b039223edae51ef95 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Fri, 8 Nov 2019 23:45:24 +0000 Subject: net/mlx5: Handle "enable_roce" devlink param Register "enable_roce" param, default value is RoCE enabled. Current configuration is stored on mlx5_core_dev and exposed to user through the cmode runtime devlink param. Changing configuration requires changing the cmode driverinit devlink param and calling devlink reload. Signed-off-by: Michael Guralnik Acked-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7b4801e96feb..1884513aac90 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1191,4 +1191,15 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev) +{ + struct devlink *devlink = priv_to_devlink(dev); + union devlink_param_value val; + + devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + &val); + return val.vbool; +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 50ec88120ea16cf8b9aabf8422c364166ce3ee17 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Oct 2019 19:20:39 +0300 Subject: can: mcp251x: get rid of legacy platform data Instead of using legacy platform data, switch to use device properties. For clock frequency we are using well established clock-frequency property. Users, two for now, are also converted here. Cc: Daniel Mack Cc: Haojian Zhuang Cc: Robert Jarzmik Cc: Russell King Signed-off-by: Andy Shevchenko Signed-off-by: Marc Kleine-Budde --- include/linux/can/platform/mcp251x.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/can/platform/mcp251x.h (limited to 'include') diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h deleted file mode 100644 index 9e5ac27fb6c1..000000000000 --- a/include/linux/can/platform/mcp251x.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _CAN_PLATFORM_MCP251X_H -#define _CAN_PLATFORM_MCP251X_H - -/* - * - * CAN bus driver for Microchip 251x CAN Controller with SPI Interface - * - */ - -#include - -/* - * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data - * @oscillator_frequency: - oscillator frequency in Hz - */ - -struct mcp251x_platform_data { - unsigned long oscillator_frequency; -}; - -#endif /* !_CAN_PLATFORM_MCP251X_H */ -- cgit v1.2.3 From 61d2350615c2c42f7af65d9a575f5dbf9738a10e Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Mon, 7 Oct 2019 13:36:58 +0200 Subject: can: rx-offload: can_rx_offload_reset(): remove no-op function This patch removes the function can_rx_offload_reset(), as it does nothing. If we ever need this function, add it back again. Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 01219f2902bf..fc75e9a7ad2f 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -44,7 +44,6 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload, unsigned int idx, u32 timestamp); int can_rx_offload_queue_tail(struct can_rx_offload *offload, struct sk_buff *skb); -void can_rx_offload_reset(struct can_rx_offload *offload); void can_rx_offload_del(struct can_rx_offload *offload); void can_rx_offload_enable(struct can_rx_offload *offload); -- cgit v1.2.3 From 4e9c9484b085dbba60b299182dd490eaeb84d18a Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Fri, 12 Jul 2019 08:02:38 +0000 Subject: can: rx-offload: Prepare for CAN FD support The skbs for classic CAN and CAN FD frames are allocated with seperate functions: alloc_can_skb() and alloc_canfd_skb(). In order to support CAN FD frames via the rx-offload helper, the driver itself has to allocate the skb (depending whether it received a classic CAN or CAN FD frame), as the rx-offload helper cannot know which kind of CAN frame the driver has received. This patch moves the allocation of the skb into the struct can_rx_offload::mailbox_read callbacks of the the flexcan and ti_hecc driver and adjusts the rx-offload helper accordingly. Signed-off-by: Joakim Zhang Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index fc75e9a7ad2f..1b78a0cfb615 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -15,9 +15,9 @@ struct can_rx_offload { struct net_device *dev; - unsigned int (*mailbox_read)(struct can_rx_offload *offload, - struct can_frame *cf, - u32 *timestamp, unsigned int mb); + struct sk_buff *(*mailbox_read)(struct can_rx_offload *offload, + unsigned int mb, u32 *timestamp, + bool drop); struct sk_buff_head skb_queue; u32 skb_queue_len_max; -- cgit v1.2.3 From e23f568aa63f64cd6b355094224cc9356c0f696b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: kernfs: fix ino wrap-around detection When the 32bit ino wraps around, kernfs increments the generation number to distinguish reused ino instances. The wrap-around detection tests whether the allocated ino is lower than what the cursor but the cursor is pointing to the next ino to allocate so the condition never triggers. Fix it by remembering the last ino and comparing against that. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Fixes: 4a3ef68acacf ("kernfs: implement i_generation") Cc: Namhyung Kim Cc: stable@vger.kernel.org # v4.14+ --- include/linux/kernfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 936b61bd504e..f797ccc650e7 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -187,6 +187,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 last_ino; u32 next_generation; struct kernfs_syscall_ops *syscall_ops; -- cgit v1.2.3 From f05499a06fb4348b935dd2ffc2c86a76705486be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: writeback: use ino_t for inodes in tracepoints Writeback TPs currently use mix of 32 and 64bits for inos. This isn't currently broken because only cgroup inos are using 32bits and they're limited to 32bits. cgroup inos will make use of 64bits. Let's uniformly use ino_t. While at it, switch the default cgroup ino value used when cgroup is disabled to 1 instead of -1U as root cgroup always uses ino 1. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Cc: Jens Axboe Cc: Namhyung Kim --- include/trace/events/writeback.h | 88 ++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c2ce6480b4b1..95e50677476b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(pgoff_t, index) ), @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), @@ -150,28 +150,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return wb->memcg_css->cgroup->kn->id.ino; } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else - return -1U; + return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return -1U; + return 1; } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { - return -1U; + return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ @@ -187,8 +187,8 @@ TRACE_EVENT(inode_foreign_history, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, cgroup_ino) + __field(ino_t, ino) + __field(ino_t, cgroup_ino) __field(unsigned int, history) ), @@ -199,7 +199,7 @@ TRACE_EVENT(inode_foreign_history, __entry->history = history; ), - TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x", + TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, __entry->ino, __entry->cgroup_ino, @@ -216,9 +216,9 @@ TRACE_EVENT(inode_switch_wbs, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, old_cgroup_ino) - __field(unsigned int, new_cgroup_ino) + __field(ino_t, ino) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) ), TP_fast_assign( @@ -228,7 +228,7 @@ TRACE_EVENT(inode_switch_wbs, __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), - TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, __entry->ino, __entry->old_cgroup_ino, @@ -245,10 +245,10 @@ TRACE_EVENT(track_foreign_dirty, TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned int, memcg_id) - __field(unsigned int, cgroup_ino) - __field(unsigned int, page_cgroup_ino) + __field(ino_t, cgroup_ino) + __field(ino_t, page_cgroup_ino) ), TP_fast_assign( @@ -263,7 +263,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; ), - TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u", + TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, __entry->ino, @@ -282,7 +282,7 @@ TRACE_EVENT(flush_foreign, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), @@ -294,7 +294,7 @@ TRACE_EVENT(flush_foreign, __entry->frn_memcg_id = frn_memcg_id; ), - TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u", + TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, __entry->cgroup_ino, __entry->frn_bdi_id, @@ -311,9 +311,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(int, sync_mode) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -324,7 +324,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, __entry->ino, __entry->sync_mode, @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, @@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -413,13 +413,13 @@ DECLARE_EVENT_CLASS(writeback_class, TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: cgroup_ino=%u", + TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, __entry->cgroup_ino ) @@ -459,7 +459,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -478,7 +478,7 @@ DECLARE_EVENT_CLASS(wbc_class, TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx cgroup_ino=%u", + "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -510,7 +510,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -522,7 +522,7 @@ TRACE_EVENT(writeback_queue_io, __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ @@ -596,7 +596,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -614,7 +614,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu cgroup_ino=%u", + "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ @@ -660,7 +660,7 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -692,7 +692,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u", + "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, @@ -718,10 +718,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -733,7 +733,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -789,13 +789,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -811,7 +811,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u", + "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -845,7 +845,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_STRUCT__entry( __field( dev_t, dev ) - __field(unsigned long, ino ) + __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) -- cgit v1.2.3 From db53c73a8b5db120cb741d7d932cdf831a576e8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: netprio: use css ID instead of cgroup ID netprio uses cgroup ID to index the priority mapping table. This is currently okay as cgroup IDs are allocated using idr and packed. However, cgroup IDs will be changed to use full 64bit range and won't be packed making this impractical. netprio doesn't care what type of IDs it uses as long as they can identify the controller instances and are packed. Let's switch to css IDs instead of cgroup IDs. Signed-off-by: Tejun Heo Acked-by: Neil Horman Reviewed-by: Greg Kroah-Hartman Cc: "David S. Miller" Cc: Namhyung Kim --- include/net/netprio_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index cfc9441ef074..dec7522b6ce1 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_lock(); css = task_css(p, net_prio_cgrp_id); - idx = css->cgroup->id; + idx = css->id; rcu_read_unlock(); return idx; } -- cgit v1.2.3 From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This simplifies ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- include/linux/cgroup.h | 17 +++++++-------- include/linux/kernfs.h | 45 ++++++++++++++++++++++++---------------- include/trace/events/writeback.h | 4 ++-- 3 files changed, 37 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..815fff49d555 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -616,7 +616,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->id.ino; + return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ @@ -687,13 +687,12 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return &cgrp->kn->id; + return cgrp->kn->id; } -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen); +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -718,9 +717,9 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return NULL; + return 0; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) @@ -739,8 +738,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return true; } -static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) {} +static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) +{} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f797ccc650e7..b2fc5c8ef6d9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -104,21 +104,6 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; -/* represent a kernfs node */ -union kernfs_node_id { - struct { - /* - * blktrace will export this struct as a simplified 'struct - * fid' (which is a big data struction), so userspace can use - * it to find kernfs node. The layout must match the first two - * fields of 'struct fid' exactly. - */ - u32 ino; - u32 generation; - }; - u64 id; -}; - /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -155,7 +140,12 @@ struct kernfs_node { void *priv; - union kernfs_node_id id; + /* + * 64bit unique ID. Lower 32bits carry the inode number and lower + * generation. + */ + u64 id; + unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; @@ -292,6 +282,26 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) return kn->flags & KERNFS_TYPE_MASK; } +static inline ino_t kernfs_id_ino(u64 id) +{ + return (u32)id; +} + +static inline u32 kernfs_id_gen(u64 id) +{ + return id >> 32; +} + +static inline ino_t kernfs_ino(struct kernfs_node *kn) +{ + return kernfs_id_ino(kn->id); +} + +static inline ino_t kernfs_gen(struct kernfs_node *kn) +{ + return kernfs_id_gen(kn->id); +} + /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty @@ -383,8 +393,7 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 95e50677476b..b4f0ffe1817e 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -152,7 +152,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->id.ino; + return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) @@ -260,7 +260,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", -- cgit v1.2.3 From fe0f726c9fb626b1092a9ea3bf75f57f2eed676e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: combine ino/id lookup functions into kernfs_find_and_get_node_by_id() kernfs_find_and_get_node_by_ino() looks the kernfs_node matching the specified ino. On top of that, kernfs_get_node_by_id() and kernfs_fh_get_inode() implement full ID matching by testing the rest of ID. On surface, confusingly, the two are slightly different in that the latter uses 0 gen as wildcard while the former doesn't - does it mean that the latter can't uniquely identify inodes w/ 0 gen? In practice, this is a distinction without a difference because generation number starts at 1. There are no actual IDs with 0 gen, so it can always safely used as wildcard. Let's simplify the code by renaming kernfs_find_and_get_node_by_ino() to kernfs_find_and_get_node_by_id(), moving all lookup logics into it, and removing now unnecessary kernfs_get_node_by_id(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b2fc5c8ef6d9..38267cc9420c 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -393,7 +393,8 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) -- cgit v1.2.3 From 33c5ac9175195c36a0b7005aaf503a2e81f117a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: implement custom exportfs ops and fid type The current kernfs exportfs implementation uses the generic_fh_*() helpers and FILEID_INO32_GEN[_PARENT] which limits ino to 32bits. Let's implement custom exportfs operations and fid type to remove the restriction. * FILEID_KERNFS is a single u64 value whose content is kernfs_node->id. This is the only native fid type. * For backward compatibility with blk_log_action() path which exposes (ino,gen) pairs which userland assembles into FILEID_INO32_GEN keys, combine the generic keys into 64bit IDs in the same order. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/exportfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cf6571fc9c01..d896b8657085 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -104,6 +104,11 @@ enum fid_type { */ FILEID_LUSTRE = 0x97, + /* + * 64 bit unique kernfs id + */ + FILEID_KERNFS = 0xfe, + /* * Filesystems must not use 0xff file ID. */ -- cgit v1.2.3 From 40430452fd5da1509177ac597b394614cd3a121f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: use 64bit inos if ino_t is 64bit Each kernfs_node is identified with a 64bit ID. The low 32bit is exposed as ino and the high gen. While this already allows using inos as keys by looking up with wildcard generation number of 0, it's adding unnecessary complications for 64bit ino archs which can directly use kernfs_node IDs as inos to uniquely identify each cgroup instance. This patch exposes IDs directly as inos on 64bit ino archs. The conversion is mostly straight-forward. * 32bit ino archs behave the same as before. 64bit ino archs now use the whole 64bit ID as ino and the generation number is fixed at 1. * 64bit inos still use the same idr allocator which gurantees that the lower 32bits identify the current live instance uniquely and the high 32bits are incremented whenever the low bits wrap. As the upper 32bits are no longer used as gen and we don't wanna start ino allocation with 33rd bit set, the initial value for highbits allocation is changed to 0 on 64bit ino archs. * blktrace exposes two 32bit numbers - (INO,GEN) pair - to identify the issuing cgroup. Userland builds FILEID_INO32_GEN fids from these numbers to look up the cgroups. To remain compatible with the behavior, always output (LOW32,HIGH32) which will be constructed back to the original 64bit ID by __kernfs_fh_to_dentry(). Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/kernfs.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 38267cc9420c..dded2e5a9f42 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -141,8 +141,8 @@ struct kernfs_node { void *priv; /* - * 64bit unique ID. Lower 32bits carry the inode number and lower - * generation. + * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, + * the low 32bits are ino and upper generation. */ u64 id; @@ -177,8 +177,8 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; - u32 last_ino; - u32 next_generation; + u32 last_id_lowbits; + u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ @@ -284,12 +284,20 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) static inline ino_t kernfs_id_ino(u64 id) { - return (u32)id; + /* id is ino if ino_t is 64bit; otherwise, low 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return id; + else + return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { - return id >> 32; + /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return 1; + else + return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) -- cgit v1.2.3 From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/cgroup-defs.h | 17 ++--------------- include/linux/cgroup.h | 17 +++++++---------- include/trace/events/cgroup.h | 6 +++--- 3 files changed, 12 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4904b1ebd1ff..63097cb243cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -354,16 +354,6 @@ struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with @@ -488,7 +478,7 @@ struct cgroup { struct cgroup_freezer_state freezer; /* ids of the ancestors at each level including self */ - int ancestor_ids[]; + u64 ancestor_ids[]; }; /* @@ -509,7 +499,7 @@ struct cgroup_root { struct cgroup cgrp; /* for cgrp->ancestor_ids[0] */ - int cgrp_ancestor_id_storage; + u64 cgrp_ancestor_id_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -520,9 +510,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 815fff49d555..d7ddebd0cdec 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -304,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + /** * css_get - obtain a reference on the specified css * @css: target css @@ -565,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == ancestor->id; + return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** @@ -687,17 +692,13 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return cgrp->kn->id; -} - void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; +static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -717,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return 0; -} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a566cc521476..7f42a3de59e6 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; - __entry->dst_id = dst_cgrp->id; + __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; @@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; -- cgit v1.2.3 From 6c0867022352027409f5a9fee1d3c6923f9e083e Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 12 Nov 2019 11:35:00 +0000 Subject: net: sfp: fix sfp_bus_add_upstream() warning When building with SFP disabled, the stub for sfp_bus_add_upstream() missed "inline". Add it. Fixes: 727b3668b730 ("net: sfp: rework upstream interface") Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index c8464de7cff5..3b35efd85bb1 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -563,8 +563,8 @@ static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode) return NULL; } -static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, - const struct sfp_upstream_ops *ops) +static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, + const struct sfp_upstream_ops *ops) { return 0; } -- cgit v1.2.3 From e2cde864a1d3e3626bfc8fa088fbc82b04ce66ed Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 12 Nov 2019 14:07:49 +0200 Subject: devlink: Allow large formatted message of binary output Devlink supports pair output of name and value. When the value is binary, it must be presented in an array. If the length of the binary value exceeds fmsg limitation, break the value into chunks internally. Signed-off-by: Aya Levin Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7891611868e4..7e72b2e71164 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -967,8 +967,6 @@ int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value); int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value); int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value); @@ -981,7 +979,7 @@ int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len); + const void *value, u32 value_len); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, -- cgit v1.2.3 From 15d0b22c01e6320241fe4d570e02de2935b842bf Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Mon, 2 Sep 2019 07:27:34 -0700 Subject: tpm: provide a way to override the chip returned durations Patch adds method ->update_durations to override returned durations in case TPM chip misbehaves for TPM 1.2 drivers. Cc: Peter Huewe Cc: Jason Gunthorpe Signed-off-by: Alexey Klimov Signed-off-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen (!update_durations path) Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 53c0ea9ec9df..bb1d1ac7081d 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -67,6 +67,8 @@ struct tpm_class_ops { u8 (*status) (struct tpm_chip *chip); void (*update_timeouts)(struct tpm_chip *chip, unsigned long *timeout_cap); + void (*update_durations)(struct tpm_chip *chip, + unsigned long *duration_cap); int (*go_idle)(struct tpm_chip *chip); int (*cmd_ready)(struct tpm_chip *chip); int (*request_locality)(struct tpm_chip *chip, int loc); -- cgit v1.2.3 From 74edff2d74c64ca5977a57efb5c238c8f5318ba9 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 16 Oct 2019 10:44:52 +0530 Subject: tpm: Move tpm_buf code to include/linux/ Move tpm_buf code to common include/linux/tpm.h header so that it can be reused via other subsystems like trusted keys etc. Also rename trusted keys and asymmetric keys usage of TPM 1.x buffer implementation to tpm1_buf to avoid any compilation errors. Suggested-by: Jarkko Sakkinen Signed-off-by: Sumit Garg Reviewed-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/keys/trusted.h | 12 +-- include/linux/tpm.h | 212 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/keys/trusted.h b/include/keys/trusted.h index 0071298b9b28..841ae111c976 100644 --- a/include/keys/trusted.h +++ b/include/keys/trusted.h @@ -17,7 +17,7 @@ #define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) #define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) -struct tpm_buf { +struct tpm1_buf { int len; unsigned char data[MAX_BUF_SIZE]; }; @@ -46,7 +46,7 @@ int TSS_checkhmac1(unsigned char *buffer, unsigned int keylen, ...); int trusted_tpm_send(unsigned char *cmd, size_t buflen); -int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); +int oiap(struct tpm1_buf *tb, uint32_t *handle, unsigned char *nonce); #define TPM_DEBUG 0 @@ -110,24 +110,24 @@ static inline void dump_tpm_buf(unsigned char *buf) } #endif -static inline void store8(struct tpm_buf *buf, const unsigned char value) +static inline void store8(struct tpm1_buf *buf, const unsigned char value) { buf->data[buf->len++] = value; } -static inline void store16(struct tpm_buf *buf, const uint16_t value) +static inline void store16(struct tpm1_buf *buf, const uint16_t value) { *(uint16_t *) & buf->data[buf->len] = htons(value); buf->len += sizeof value; } -static inline void store32(struct tpm_buf *buf, const uint32_t value) +static inline void store32(struct tpm1_buf *buf, const uint32_t value) { *(uint32_t *) & buf->data[buf->len] = htonl(value); buf->len += sizeof value; } -static inline void storebytes(struct tpm_buf *buf, const unsigned char *in, +static inline void storebytes(struct tpm1_buf *buf, const unsigned char *in, const int len) { memcpy(buf->data + buf->len, in, len); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index bb1d1ac7081d..c78119fcac7f 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #define TPM_DIGEST_SIZE 20 /* Max TPM v1.2 PCR size */ @@ -163,6 +164,217 @@ struct tpm_chip { int locality; }; +#define TPM_HEADER_SIZE 10 + +enum tpm2_const { + TPM2_PLATFORM_PCR = 24, + TPM2_PCR_SELECT_MIN = ((TPM2_PLATFORM_PCR + 7) / 8), +}; + +enum tpm2_timeouts { + TPM2_TIMEOUT_A = 750, + TPM2_TIMEOUT_B = 2000, + TPM2_TIMEOUT_C = 200, + TPM2_TIMEOUT_D = 30, + TPM2_DURATION_SHORT = 20, + TPM2_DURATION_MEDIUM = 750, + TPM2_DURATION_LONG = 2000, + TPM2_DURATION_LONG_LONG = 300000, + TPM2_DURATION_DEFAULT = 120000, +}; + +enum tpm2_structures { + TPM2_ST_NO_SESSIONS = 0x8001, + TPM2_ST_SESSIONS = 0x8002, +}; + +/* Indicates from what layer of the software stack the error comes from */ +#define TSS2_RC_LAYER_SHIFT 16 +#define TSS2_RESMGR_TPM_RC_LAYER (11 << TSS2_RC_LAYER_SHIFT) + +enum tpm2_return_codes { + TPM2_RC_SUCCESS = 0x0000, + TPM2_RC_HASH = 0x0083, /* RC_FMT1 */ + TPM2_RC_HANDLE = 0x008B, + TPM2_RC_INITIALIZE = 0x0100, /* RC_VER1 */ + TPM2_RC_FAILURE = 0x0101, + TPM2_RC_DISABLED = 0x0120, + TPM2_RC_COMMAND_CODE = 0x0143, + TPM2_RC_TESTING = 0x090A, /* RC_WARN */ + TPM2_RC_REFERENCE_H0 = 0x0910, + TPM2_RC_RETRY = 0x0922, +}; + +enum tpm2_command_codes { + TPM2_CC_FIRST = 0x011F, + TPM2_CC_HIERARCHY_CONTROL = 0x0121, + TPM2_CC_HIERARCHY_CHANGE_AUTH = 0x0129, + TPM2_CC_CREATE_PRIMARY = 0x0131, + TPM2_CC_SEQUENCE_COMPLETE = 0x013E, + TPM2_CC_SELF_TEST = 0x0143, + TPM2_CC_STARTUP = 0x0144, + TPM2_CC_SHUTDOWN = 0x0145, + TPM2_CC_NV_READ = 0x014E, + TPM2_CC_CREATE = 0x0153, + TPM2_CC_LOAD = 0x0157, + TPM2_CC_SEQUENCE_UPDATE = 0x015C, + TPM2_CC_UNSEAL = 0x015E, + TPM2_CC_CONTEXT_LOAD = 0x0161, + TPM2_CC_CONTEXT_SAVE = 0x0162, + TPM2_CC_FLUSH_CONTEXT = 0x0165, + TPM2_CC_VERIFY_SIGNATURE = 0x0177, + TPM2_CC_GET_CAPABILITY = 0x017A, + TPM2_CC_GET_RANDOM = 0x017B, + TPM2_CC_PCR_READ = 0x017E, + TPM2_CC_PCR_EXTEND = 0x0182, + TPM2_CC_EVENT_SEQUENCE_COMPLETE = 0x0185, + TPM2_CC_HASH_SEQUENCE_START = 0x0186, + TPM2_CC_CREATE_LOADED = 0x0191, + TPM2_CC_LAST = 0x0193, /* Spec 1.36 */ +}; + +enum tpm2_permanent_handles { + TPM2_RS_PW = 0x40000009, +}; + +enum tpm2_capabilities { + TPM2_CAP_HANDLES = 1, + TPM2_CAP_COMMANDS = 2, + TPM2_CAP_PCRS = 5, + TPM2_CAP_TPM_PROPERTIES = 6, +}; + +enum tpm2_properties { + TPM_PT_TOTAL_COMMANDS = 0x0129, +}; + +enum tpm2_startup_types { + TPM2_SU_CLEAR = 0x0000, + TPM2_SU_STATE = 0x0001, +}; + +enum tpm2_cc_attrs { + TPM2_CC_ATTR_CHANDLES = 25, + TPM2_CC_ATTR_RHANDLE = 28, +}; + +#define TPM_VID_INTEL 0x8086 +#define TPM_VID_WINBOND 0x1050 +#define TPM_VID_STM 0x104A + +enum tpm_chip_flags { + TPM_CHIP_FLAG_TPM2 = BIT(1), + TPM_CHIP_FLAG_IRQ = BIT(2), + TPM_CHIP_FLAG_VIRTUAL = BIT(3), + TPM_CHIP_FLAG_HAVE_TIMEOUTS = BIT(4), + TPM_CHIP_FLAG_ALWAYS_POWERED = BIT(5), + TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), +}; + +#define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) + +struct tpm_header { + __be16 tag; + __be32 length; + union { + __be32 ordinal; + __be32 return_code; + }; +} __packed; + +/* A string buffer type for constructing TPM commands. This is based on the + * ideas of string buffer code in security/keys/trusted.h but is heap based + * in order to keep the stack usage minimal. + */ + +enum tpm_buf_flags { + TPM_BUF_OVERFLOW = BIT(0), +}; + +struct tpm_buf { + unsigned int flags; + u8 *data; +}; + +static inline void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal) +{ + struct tpm_header *head = (struct tpm_header *)buf->data; + + head->tag = cpu_to_be16(tag); + head->length = cpu_to_be32(sizeof(*head)); + head->ordinal = cpu_to_be32(ordinal); +} + +static inline int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal) +{ + buf->data = (u8 *)__get_free_page(GFP_KERNEL); + if (!buf->data) + return -ENOMEM; + + buf->flags = 0; + tpm_buf_reset(buf, tag, ordinal); + return 0; +} + +static inline void tpm_buf_destroy(struct tpm_buf *buf) +{ + free_page((unsigned long)buf->data); +} + +static inline u32 tpm_buf_length(struct tpm_buf *buf) +{ + struct tpm_header *head = (struct tpm_header *)buf->data; + + return be32_to_cpu(head->length); +} + +static inline u16 tpm_buf_tag(struct tpm_buf *buf) +{ + struct tpm_header *head = (struct tpm_header *)buf->data; + + return be16_to_cpu(head->tag); +} + +static inline void tpm_buf_append(struct tpm_buf *buf, + const unsigned char *new_data, + unsigned int new_len) +{ + struct tpm_header *head = (struct tpm_header *)buf->data; + u32 len = tpm_buf_length(buf); + + /* Return silently if overflow has already happened. */ + if (buf->flags & TPM_BUF_OVERFLOW) + return; + + if ((len + new_len) > PAGE_SIZE) { + WARN(1, "tpm_buf: overflow\n"); + buf->flags |= TPM_BUF_OVERFLOW; + return; + } + + memcpy(&buf->data[len], new_data, new_len); + head->length = cpu_to_be32(len + new_len); +} + +static inline void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value) +{ + tpm_buf_append(buf, &value, 1); +} + +static inline void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value) +{ + __be16 value2 = cpu_to_be16(value); + + tpm_buf_append(buf, (u8 *) &value2, 2); +} + +static inline void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value) +{ + __be32 value2 = cpu_to_be32(value); + + tpm_buf_append(buf, (u8 *) &value2, 4); +} + #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) extern int tpm_is_tpm2(struct tpm_chip *chip); -- cgit v1.2.3 From c6f61e59760df04a245ef48c8805b4eb3d958230 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 16 Oct 2019 10:44:53 +0530 Subject: KEYS: Use common tpm_buf for trusted and asymmetric keys Switch to utilize common heap based tpm_buf code for TPM based trusted and asymmetric keys rather than using stack based tpm1_buf code. Also, remove tpm1_buf code. Suggested-by: Jarkko Sakkinen Signed-off-by: Sumit Garg Reviewed-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/keys/trusted.h | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'include') diff --git a/include/keys/trusted.h b/include/keys/trusted.h index 841ae111c976..29e3e9ba8e1a 100644 --- a/include/keys/trusted.h +++ b/include/keys/trusted.h @@ -5,10 +5,6 @@ /* implementation specific TPM constants */ #define MAX_BUF_SIZE 1024 #define TPM_GETRANDOM_SIZE 14 -#define TPM_OSAP_SIZE 36 -#define TPM_OIAP_SIZE 10 -#define TPM_SEAL_SIZE 87 -#define TPM_UNSEAL_SIZE 104 #define TPM_SIZE_OFFSET 2 #define TPM_RETURN_OFFSET 6 #define TPM_DATA_OFFSET 10 @@ -17,13 +13,6 @@ #define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) #define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) -struct tpm1_buf { - int len; - unsigned char data[MAX_BUF_SIZE]; -}; - -#define INIT_BUF(tb) (tb->len = 0) - struct osapsess { uint32_t handle; unsigned char secret[SHA1_DIGEST_SIZE]; @@ -46,7 +35,7 @@ int TSS_checkhmac1(unsigned char *buffer, unsigned int keylen, ...); int trusted_tpm_send(unsigned char *cmd, size_t buflen); -int oiap(struct tpm1_buf *tb, uint32_t *handle, unsigned char *nonce); +int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); #define TPM_DEBUG 0 @@ -109,28 +98,4 @@ static inline void dump_tpm_buf(unsigned char *buf) { } #endif - -static inline void store8(struct tpm1_buf *buf, const unsigned char value) -{ - buf->data[buf->len++] = value; -} - -static inline void store16(struct tpm1_buf *buf, const uint16_t value) -{ - *(uint16_t *) & buf->data[buf->len] = htons(value); - buf->len += sizeof value; -} - -static inline void store32(struct tpm1_buf *buf, const uint32_t value) -{ - *(uint32_t *) & buf->data[buf->len] = htonl(value); - buf->len += sizeof value; -} - -static inline void storebytes(struct tpm1_buf *buf, const unsigned char *in, - const int len) -{ - memcpy(buf->data + buf->len, in, len); - buf->len += len; -} #endif -- cgit v1.2.3 From 47f9c279689107f306fff506753971a39a8a7ffc Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 16 Oct 2019 10:44:54 +0530 Subject: KEYS: trusted: Create trusted keys subsystem Move existing code to trusted keys subsystem. Also, rename files with "tpm" as suffix which provides the underlying implementation. Suggested-by: Jarkko Sakkinen Signed-off-by: Sumit Garg Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/Kbuild | 1 - include/keys/trusted.h | 101 ------------------------------------------- include/keys/trusted_tpm.h | 104 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 102 deletions(-) delete mode 100644 include/keys/trusted.h create mode 100644 include/keys/trusted_tpm.h (limited to 'include') diff --git a/include/Kbuild b/include/Kbuild index ffba79483cc5..6f9ec5a64ec5 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -65,7 +65,6 @@ header-test- += keys/asymmetric-subtype.h header-test- += keys/asymmetric-type.h header-test- += keys/big_key-type.h header-test- += keys/request_key_auth-type.h -header-test- += keys/trusted.h header-test- += kvm/arm_arch_timer.h header-test- += kvm/arm_pmu.h header-test-$(CONFIG_ARM) += kvm/arm_psci.h diff --git a/include/keys/trusted.h b/include/keys/trusted.h deleted file mode 100644 index 29e3e9ba8e1a..000000000000 --- a/include/keys/trusted.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __TRUSTED_KEY_H -#define __TRUSTED_KEY_H - -/* implementation specific TPM constants */ -#define MAX_BUF_SIZE 1024 -#define TPM_GETRANDOM_SIZE 14 -#define TPM_SIZE_OFFSET 2 -#define TPM_RETURN_OFFSET 6 -#define TPM_DATA_OFFSET 10 - -#define LOAD32(buffer, offset) (ntohl(*(uint32_t *)&buffer[offset])) -#define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) -#define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) - -struct osapsess { - uint32_t handle; - unsigned char secret[SHA1_DIGEST_SIZE]; - unsigned char enonce[TPM_NONCE_SIZE]; -}; - -/* discrete values, but have to store in uint16_t for TPM use */ -enum { - SEAL_keytype = 1, - SRK_keytype = 4 -}; - -int TSS_authhmac(unsigned char *digest, const unsigned char *key, - unsigned int keylen, unsigned char *h1, - unsigned char *h2, unsigned int h3, ...); -int TSS_checkhmac1(unsigned char *buffer, - const uint32_t command, - const unsigned char *ononce, - const unsigned char *key, - unsigned int keylen, ...); - -int trusted_tpm_send(unsigned char *cmd, size_t buflen); -int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); - -#define TPM_DEBUG 0 - -#if TPM_DEBUG -static inline void dump_options(struct trusted_key_options *o) -{ - pr_info("trusted_key: sealing key type %d\n", o->keytype); - pr_info("trusted_key: sealing key handle %0X\n", o->keyhandle); - pr_info("trusted_key: pcrlock %d\n", o->pcrlock); - pr_info("trusted_key: pcrinfo %d\n", o->pcrinfo_len); - print_hex_dump(KERN_INFO, "pcrinfo ", DUMP_PREFIX_NONE, - 16, 1, o->pcrinfo, o->pcrinfo_len, 0); -} - -static inline void dump_payload(struct trusted_key_payload *p) -{ - pr_info("trusted_key: key_len %d\n", p->key_len); - print_hex_dump(KERN_INFO, "key ", DUMP_PREFIX_NONE, - 16, 1, p->key, p->key_len, 0); - pr_info("trusted_key: bloblen %d\n", p->blob_len); - print_hex_dump(KERN_INFO, "blob ", DUMP_PREFIX_NONE, - 16, 1, p->blob, p->blob_len, 0); - pr_info("trusted_key: migratable %d\n", p->migratable); -} - -static inline void dump_sess(struct osapsess *s) -{ - print_hex_dump(KERN_INFO, "trusted-key: handle ", DUMP_PREFIX_NONE, - 16, 1, &s->handle, 4, 0); - pr_info("trusted-key: secret:\n"); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, - 16, 1, &s->secret, SHA1_DIGEST_SIZE, 0); - pr_info("trusted-key: enonce:\n"); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, - 16, 1, &s->enonce, SHA1_DIGEST_SIZE, 0); -} - -static inline void dump_tpm_buf(unsigned char *buf) -{ - int len; - - pr_info("\ntrusted-key: tpm buffer\n"); - len = LOAD32(buf, TPM_SIZE_OFFSET); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, buf, len, 0); -} -#else -static inline void dump_options(struct trusted_key_options *o) -{ -} - -static inline void dump_payload(struct trusted_key_payload *p) -{ -} - -static inline void dump_sess(struct osapsess *s) -{ -} - -static inline void dump_tpm_buf(unsigned char *buf) -{ -} -#endif -#endif diff --git a/include/keys/trusted_tpm.h b/include/keys/trusted_tpm.h new file mode 100644 index 000000000000..7b9d7b450a9e --- /dev/null +++ b/include/keys/trusted_tpm.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRUSTED_TPM_H +#define __TRUSTED_TPM_H + +#include +#include + +/* implementation specific TPM constants */ +#define MAX_BUF_SIZE 1024 +#define TPM_GETRANDOM_SIZE 14 +#define TPM_SIZE_OFFSET 2 +#define TPM_RETURN_OFFSET 6 +#define TPM_DATA_OFFSET 10 + +#define LOAD32(buffer, offset) (ntohl(*(uint32_t *)&buffer[offset])) +#define LOAD32N(buffer, offset) (*(uint32_t *)&buffer[offset]) +#define LOAD16(buffer, offset) (ntohs(*(uint16_t *)&buffer[offset])) + +struct osapsess { + uint32_t handle; + unsigned char secret[SHA1_DIGEST_SIZE]; + unsigned char enonce[TPM_NONCE_SIZE]; +}; + +/* discrete values, but have to store in uint16_t for TPM use */ +enum { + SEAL_keytype = 1, + SRK_keytype = 4 +}; + +int TSS_authhmac(unsigned char *digest, const unsigned char *key, + unsigned int keylen, unsigned char *h1, + unsigned char *h2, unsigned int h3, ...); +int TSS_checkhmac1(unsigned char *buffer, + const uint32_t command, + const unsigned char *ononce, + const unsigned char *key, + unsigned int keylen, ...); + +int trusted_tpm_send(unsigned char *cmd, size_t buflen); +int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); + +#define TPM_DEBUG 0 + +#if TPM_DEBUG +static inline void dump_options(struct trusted_key_options *o) +{ + pr_info("trusted_key: sealing key type %d\n", o->keytype); + pr_info("trusted_key: sealing key handle %0X\n", o->keyhandle); + pr_info("trusted_key: pcrlock %d\n", o->pcrlock); + pr_info("trusted_key: pcrinfo %d\n", o->pcrinfo_len); + print_hex_dump(KERN_INFO, "pcrinfo ", DUMP_PREFIX_NONE, + 16, 1, o->pcrinfo, o->pcrinfo_len, 0); +} + +static inline void dump_payload(struct trusted_key_payload *p) +{ + pr_info("trusted_key: key_len %d\n", p->key_len); + print_hex_dump(KERN_INFO, "key ", DUMP_PREFIX_NONE, + 16, 1, p->key, p->key_len, 0); + pr_info("trusted_key: bloblen %d\n", p->blob_len); + print_hex_dump(KERN_INFO, "blob ", DUMP_PREFIX_NONE, + 16, 1, p->blob, p->blob_len, 0); + pr_info("trusted_key: migratable %d\n", p->migratable); +} + +static inline void dump_sess(struct osapsess *s) +{ + print_hex_dump(KERN_INFO, "trusted-key: handle ", DUMP_PREFIX_NONE, + 16, 1, &s->handle, 4, 0); + pr_info("trusted-key: secret:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->secret, SHA1_DIGEST_SIZE, 0); + pr_info("trusted-key: enonce:\n"); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, + 16, 1, &s->enonce, SHA1_DIGEST_SIZE, 0); +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ + int len; + + pr_info("\ntrusted-key: tpm buffer\n"); + len = LOAD32(buf, TPM_SIZE_OFFSET); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, buf, len, 0); +} +#else +static inline void dump_options(struct trusted_key_options *o) +{ +} + +static inline void dump_payload(struct trusted_key_payload *p) +{ +} + +static inline void dump_sess(struct osapsess *s) +{ +} + +static inline void dump_tpm_buf(unsigned char *buf) +{ +} +#endif +#endif -- cgit v1.2.3 From 2e19e10131a08dc65079c755fb6e8af936bfedbd Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 16 Oct 2019 10:44:55 +0530 Subject: KEYS: trusted: Move TPM2 trusted keys code Move TPM2 trusted keys code to trusted keys subsystem. The reason being it's better to consolidate all the trusted keys code to a single location so that it can be maintained sanely. Also, utilize existing tpm_send() exported API which wraps the internal tpm_transmit_cmd() API. Suggested-by: Jarkko Sakkinen Signed-off-by: Sumit Garg Reviewed-by: Jarkko Sakkinen Tested-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/keys/trusted_tpm.h | 7 +++++++ include/linux/tpm.h | 36 ++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/keys/trusted_tpm.h b/include/keys/trusted_tpm.h index 7b9d7b450a9e..a56d8e1298f2 100644 --- a/include/keys/trusted_tpm.h +++ b/include/keys/trusted_tpm.h @@ -40,6 +40,13 @@ int TSS_checkhmac1(unsigned char *buffer, int trusted_tpm_send(unsigned char *cmd, size_t buflen); int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce); +int tpm2_seal_trusted(struct tpm_chip *chip, + struct trusted_key_payload *payload, + struct trusted_key_options *options); +int tpm2_unseal_trusted(struct tpm_chip *chip, + struct trusted_key_payload *payload, + struct trusted_key_options *options); + #define TPM_DEBUG 0 #if TPM_DEBUG diff --git a/include/linux/tpm.h b/include/linux/tpm.h index c78119fcac7f..0d6e949ba315 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -296,6 +296,19 @@ struct tpm_buf { u8 *data; }; +enum tpm2_object_attributes { + TPM2_OA_USER_WITH_AUTH = BIT(6), +}; + +enum tpm2_session_attributes { + TPM2_SA_CONTINUE_SESSION = BIT(0), +}; + +struct tpm2_hash { + unsigned int crypto_id; + unsigned int tpm_id; +}; + static inline void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal) { struct tpm_header *head = (struct tpm_header *)buf->data; @@ -375,6 +388,11 @@ static inline void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value) tpm_buf_append(buf, (u8 *) &value2, 4); } +static inline u32 tpm2_rc_value(u32 rc) +{ + return (rc & BIT(7)) ? rc & 0xff : rc; +} + #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) extern int tpm_is_tpm2(struct tpm_chip *chip); @@ -384,12 +402,6 @@ extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, struct tpm_digest *digests); extern int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen); extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max); -extern int tpm_seal_trusted(struct tpm_chip *chip, - struct trusted_key_payload *payload, - struct trusted_key_options *options); -extern int tpm_unseal_trusted(struct tpm_chip *chip, - struct trusted_key_payload *payload, - struct trusted_key_options *options); extern struct tpm_chip *tpm_default_chip(void); #else static inline int tpm_is_tpm2(struct tpm_chip *chip) @@ -418,18 +430,6 @@ static inline int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max) return -ENODEV; } -static inline int tpm_seal_trusted(struct tpm_chip *chip, - struct trusted_key_payload *payload, - struct trusted_key_options *options) -{ - return -ENODEV; -} -static inline int tpm_unseal_trusted(struct tpm_chip *chip, - struct trusted_key_payload *payload, - struct trusted_key_options *options) -{ - return -ENODEV; -} static inline struct tpm_chip *tpm_default_chip(void) { return NULL; -- cgit v1.2.3 From e0e2b35b790fefbcff5689984a134cdaa4ce051c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 12 Nov 2019 15:33:11 +0100 Subject: net/sched: actions: remove unused 'order' after commit 4097e9d250fb ("net: sched: don't use tc_action->order during action dump"), 'act->order' is initialized but then it's no more read, so we can just remove this member of struct tc_action. CC: Ivan Vecera Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- include/net/act_api.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 0495bdc034d2..71347a90a9d1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -23,7 +23,6 @@ struct tc_action_ops; struct tc_action { const struct tc_action_ops *ops; __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ - __u32 order; struct tcf_idrinfo *idrinfo; u32 tcfa_index; -- cgit v1.2.3 From d41003513e61dd9d4974cb441d30b63650b85654 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Nov 2019 11:39:30 +0900 Subject: block: rework zone reporting Avoid the need to allocate a potentially large array of struct blk_zone in the block layer by switching the ->report_zones method interface to a callback model. Now the caller simply supplies a callback that is executed on each reported zone, and private data for it. Signed-off-by: Christoph Hellwig Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 +++++++-------- include/linux/device-mapper.h | 24 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a4f7abbdcf7..397bb9bc230b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -349,17 +349,16 @@ struct queue_limits { enum blk_zoned_model zoned; }; +typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, + void *data); + #ifdef CONFIG_BLK_DEV_ZONED -/* - * Maximum number of zones to report with a single report zones command. - */ -#define BLK_ZONED_REPORT_MAX_ZONES 8192U +#define BLK_ALL_ZONES ((unsigned int)-1) +int blkdev_report_zones(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); extern unsigned int blkdev_nr_zones(struct block_device *bdev); -extern int blkdev_report_zones(struct block_device *bdev, - sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones); extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); @@ -1709,7 +1708,7 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones); + unsigned int nr_zones, report_zones_cb cb, void *data); struct module *owner; const struct pr_ops *pr_ops; }; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 399ad8632356..a164cc81b710 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -17,6 +17,7 @@ struct dm_dev; struct dm_target; struct dm_table; +struct dm_report_zones_args; struct mapped_device; struct bio_vec; @@ -93,9 +94,9 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv, typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); -typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector, - struct blk_zone *zones, - unsigned int *nr_zones); +typedef int (*dm_report_zones_fn) (struct dm_target *ti, + struct dm_report_zones_args *args, + unsigned int nr_zones); /* * These iteration functions are typically used to check (and combine) @@ -422,10 +423,23 @@ struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); -void dm_remap_zone_report(struct dm_target *ti, sector_t start, - struct blk_zone *zones, unsigned int *nr_zones); union map_info *dm_get_rq_mapinfo(struct request *rq); +#ifdef CONFIG_BLK_DEV_ZONED +struct dm_report_zones_args { + struct dm_target *tgt; + sector_t next_sector; + + void *orig_data; + report_zones_cb orig_cb; + unsigned int zone_idx; + + /* must be filled by ->report_zones before calling dm_report_zones_cb */ + sector_t start; +}; +int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data); +#endif /* CONFIG_BLK_DEV_ZONED */ + /* * Device mapper functions to parse and create devices specified by the * parameter "dm-mod.create=" -- cgit v1.2.3 From b32d2f341623765f525b1a559aa1758599ed7094 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:51 +0100 Subject: netfilter: nf_flow_table: move conntrack object to struct flow_offload Simplify this code by storing the pointer to conntrack object in the flow_offload structure. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 158514281a75..88c8cd248213 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -72,6 +72,7 @@ struct flow_offload_tuple_rhash { struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; + struct nf_conn *ct; u32 flags; union { /* Your private driver data here. */ -- cgit v1.2.3 From 9f48e9bf253aa292dbf10f173f6f4c02d0349f45 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:52 +0100 Subject: netfilter: nf_flow_table: remove union from flow_offload structure Drivers do not have access to the flow_offload structure, hence remove this union from this flow_offload object as well as the original comment on top of it. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 88c8cd248213..7f892d6c1a6d 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -74,10 +74,7 @@ struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; u32 flags; - union { - /* Your private driver data here. */ - u32 timeout; - }; + u32 timeout; }; #define NF_FLOW_TIMEOUT (30 * HZ) -- cgit v1.2.3 From 62248df88a406a443b838a3633a7f60a716f999e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:53 +0100 Subject: netfilter: nf_flowtable: remove flow_offload_entry structure Move rcu_head to struct flow_offload, then remove the flow_offload_entry structure definition. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 7f892d6c1a6d..6d33734c8fa1 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -75,6 +75,7 @@ struct flow_offload { struct nf_conn *ct; u32 flags; u32 timeout; + struct rcu_head rcu_head; }; #define NF_FLOW_TIMEOUT (30 * HZ) -- cgit v1.2.3 From f1363e058b84e61d39f9796fa806090ad7a28ebd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:54 +0100 Subject: netfilter: nf_flow_table: detach routing information from flow description This patch adds the infrastructure to support for flow entry types. The initial type is NF_FLOW_OFFLOAD_ROUTE that stores the routing information into the flow entry to define a fastpath for the classic forwarding path. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 6d33734c8fa1..f000e8917487 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -70,10 +70,16 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 +enum flow_offload_type { + NF_FLOW_OFFLOAD_UNSPEC = 0, + NF_FLOW_OFFLOAD_ROUTE, +}; + struct flow_offload { struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; struct nf_conn *ct; - u32 flags; + u16 flags; + u16 type; u32 timeout; struct rcu_head rcu_head; }; @@ -86,10 +92,12 @@ struct nf_flow_route { } tuple[FLOW_OFFLOAD_DIR_MAX]; }; -struct flow_offload *flow_offload_alloc(struct nf_conn *ct, - struct nf_flow_route *route); +struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +int flow_offload_route_init(struct flow_offload *flow, + const struct nf_flow_route *route); + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); -- cgit v1.2.3 From 8bb69f3b2918788435cbd5834c66682642c09fba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:55 +0100 Subject: netfilter: nf_tables: add flowtable offload control plane This patch adds the NFTA_FLOWTABLE_FLAGS attribute that allows users to specify the NF_FLOWTABLE_HW_OFFLOAD flag. This patch also adds a new setup interface for the flowtable type to perform the flowtable offload block callback configuration. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f000e8917487..ece09d36c7a6 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -8,6 +8,7 @@ #include #include #include +#include #include struct nf_flowtable; @@ -16,17 +17,27 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + int (*setup)(struct nf_flowtable *ft, + struct net_device *dev, + enum flow_block_command cmd); void (*free)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; +enum nf_flowtable_flags { + NF_FLOWTABLE_HW_OFFLOAD = 0x1, +}; + struct nf_flowtable { struct list_head list; struct rhashtable rhashtable; int priority; const struct nf_flowtable_type *type; struct delayed_work gc_work; + unsigned int flags; + struct flow_block flow_block; + possible_net_t net; }; enum flow_offload_tuple_dir { @@ -131,4 +142,11 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) +static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd) +{ + return 0; +} + #endif /* _NF_FLOW_TABLE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 81fed16fe2b2..bb9b049310df 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1518,6 +1518,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32) */ enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1527,6 +1528,7 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_USE, NFTA_FLOWTABLE_HANDLE, NFTA_FLOWTABLE_PAD, + NFTA_FLOWTABLE_FLAGS, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) -- cgit v1.2.3 From c29f74e0df7a02b8303bcdce93a7c0132d62577a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:56 +0100 Subject: netfilter: nf_flow_table: hardware offload support This patch adds the dataplane hardware offload to the flowtable infrastructure. Three new flags represent the hardware state of this flow: * FLOW_OFFLOAD_HW: This flow entry resides in the hardware. * FLOW_OFFLOAD_HW_DYING: This flow entry has been scheduled to be remove from hardware. This might be triggered by either packet path (via TCP RST/FIN packet) or via aging. * FLOW_OFFLOAD_HW_DEAD: This flow entry has been already removed from the hardware, the software garbage collector can remove it from the software flowtable. This patch supports for: * IPv4 only. * Aging via FLOW_CLS_STATS, no packet and byte counter synchronization at this stage. This patch also adds the action callback that specifies how to convert the flow entry into the flow_rule object that is passed to the driver. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/netfilter/nf_flow_table.h | 33 +++++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f857f01234f7..9e6fb8524d91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -848,6 +848,7 @@ enum tc_setup_type { TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, + TC_SETUP_FT, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ece09d36c7a6..eea66de328d3 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -12,6 +12,9 @@ #include struct nf_flowtable; +struct nf_flow_rule; +struct flow_offload; +enum flow_offload_tuple_dir; struct nf_flowtable_type { struct list_head list; @@ -20,6 +23,10 @@ struct nf_flowtable_type { int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); + int (*action)(struct net *net, + const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; @@ -80,6 +87,9 @@ struct flow_offload_tuple_rhash { #define FLOW_OFFLOAD_DNAT 0x2 #define FLOW_OFFLOAD_DYING 0x4 #define FLOW_OFFLOAD_TEARDOWN 0x8 +#define FLOW_OFFLOAD_HW 0x10 +#define FLOW_OFFLOAD_HW_DYING 0x20 +#define FLOW_OFFLOAD_HW_DEAD 0x40 enum flow_offload_type { NF_FLOW_OFFLOAD_UNSPEC = 0, @@ -142,11 +152,22 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) -static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd) -{ - return 0; -} +void nf_flow_offload_add(struct nf_flowtable *flowtable, + struct flow_offload *flow); +void nf_flow_offload_del(struct nf_flowtable *flowtable, + struct flow_offload *flow); +void nf_flow_offload_stats(struct nf_flowtable *flowtable, + struct flow_offload *flow); + +void nf_flow_table_offload_flush(struct nf_flowtable *flowtable); +int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd); +int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); + +int nf_flow_table_offload_init(void); +void nf_flow_table_offload_exit(void); #endif /* _NF_FLOW_TABLE_H */ -- cgit v1.2.3 From 25da5eb32cd51383f6dca7aad252376f1979c075 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Oct 2019 16:02:50 +0100 Subject: netfilter: nft_meta: offload support for interface index This patch adds support for offloading the NFT_META_IIF selector. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 03cf5856d76f..ea7d1d78b92d 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -45,6 +45,7 @@ struct nft_flow_key { struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_eth_addrs eth_addrs; + struct flow_dissector_key_meta meta; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct nft_flow_match { -- cgit v1.2.3 From cb711b91a3c685192f2cabd3735ca3de04694ed8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Nov 2019 01:27:21 +0800 Subject: blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() These functions are not referenced, so delete them. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index dc03e059fdff..11cfd6470b1a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -424,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); -bool blk_mq_can_queue(struct blk_mq_hw_ctx *); bool blk_mq_queue_inflight(struct request_queue *q); -- cgit v1.2.3 From 708edafa883186f55b24fa0c380242b5282f9105 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Nov 2019 01:27:22 +0800 Subject: sbitmap: Delete sbitmap_any_bit_clear() Since the only caller of this function has been deleted, delete this one also. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a986ac12a848..e40d019c3d9d 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -216,15 +216,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, */ bool sbitmap_any_bit_set(const struct sbitmap *sb); -/** - * sbitmap_any_bit_clear() - Check for an unset bit in a &struct - * sbitmap. - * @sb: Bitmap to check. - * - * Return: true if any bit in the bitmap is clear, false otherwise. - */ -bool sbitmap_any_bit_clear(const struct sbitmap *sb); - #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) -- cgit v1.2.3 From 3ad2522c64cff1f5aebb987b00683268f0cc7c29 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 29 Oct 2019 13:41:38 -0700 Subject: statx: define STATX_ATTR_VERITY Add a statx attribute bit STATX_ATTR_VERITY which will be set if the file has fs-verity enabled. This is the statx() equivalent of FS_VERITY_FL which is returned by FS_IOC_GETFLAGS. This is useful because it allows applications to check whether a file is a verity file without opening it. Opening a verity file can be expensive because the fsverity_info is set up on open, which involves parsing metadata and optionally verifying a cryptographic signature. This is analogous to how various other bits are exposed through both FS_IOC_GETFLAGS and statx(), e.g. the encrypt bit. Reviewed-by: Andreas Dilger Acked-by: Darrick J. Wong Signed-off-by: Eric Biggers --- include/linux/stat.h | 3 ++- include/uapi/linux/stat.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stat.h b/include/linux/stat.h index 765573dc17d6..528c4baad091 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -33,7 +33,8 @@ struct kstat { STATX_ATTR_IMMUTABLE | \ STATX_ATTR_APPEND | \ STATX_ATTR_NODUMP | \ - STATX_ATTR_ENCRYPTED \ + STATX_ATTR_ENCRYPTED | \ + STATX_ATTR_VERITY \ )/* Attrs corresponding to FS_*_FL flags */ u64 ino; dev_t dev; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7b35e98d3c58..ad80a5c885d5 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -167,8 +167,8 @@ struct statx { #define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ #define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ - #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #endif /* _UAPI_LINUX_STAT_H */ -- cgit v1.2.3 From 975b992fdd4b38028d7c1dcf38286d6e7991c1b2 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 12 Nov 2019 00:34:29 +0100 Subject: net/mlx5: Add new chain for netfilter flow table offload Netfilter tables (nftables) implements a software datapath that comes after tc ingress datapath. The datapath supports offloading such rules via the flow table offload API. This API is currently only used by NFT and it doesn't provide the global priority in regards to tc offload, so we assume offloading such rules must come after tc. It does provide a flow table priority parameter, so we need to provide some supported priority range. For that, split fastpath prio to two, flow table offload and tc offload, with one dedicated priority chain for flow table offload. Next patch will re-use the multi chain API to access this chain by allowing access to this chain by the fdb_sub_namespace. Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Acked-by: Pablo Neira Ayuso Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 724d276ea133..4e5b84e66822 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -80,7 +80,8 @@ enum mlx5_flow_namespace_type { enum { FDB_BYPASS_PATH, - FDB_FAST_PATH, + FDB_TC_OFFLOAD, + FDB_FT_OFFLOAD, FDB_SLOW_PATH, }; -- cgit v1.2.3 From ca8ffdaea560a3be3f0701ff4b019fa25a308f82 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 11 Nov 2019 15:45:42 +0100 Subject: xen/mcelog: drop __MC_MSR_MCGCAP It has never been part of Xen's public interface, and there's therefore no guarantee for MCG_CAP's value to always be present in array entry 0. Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- include/xen/interface/xen-mca.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h index 73a4ea714d93..d7a45f08fb48 100644 --- a/include/xen/interface/xen-mca.h +++ b/include/xen/interface/xen-mca.h @@ -183,7 +183,6 @@ struct mc_info { DEFINE_GUEST_HANDLE_STRUCT(mc_info); #define __MC_MSR_ARRAYSIZE 8 -#define __MC_MSR_MCGCAP 0 #define __MC_NMSRS 1 #define MC_NCAPS 7 struct mcinfo_logical_cpu { -- cgit v1.2.3 From 4e3f77d8419b6787f3eb4d4f5178f459d693f9bb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 11 Nov 2019 15:46:26 +0100 Subject: xen/mcelog: add PPIN to record when available This is to augment commit 3f5a7896a5 ("x86/mce: Include the PPIN in MCE records when available"). I'm also adding "synd" and "ipid" fields to struct xen_mce, in an attempt to keep field offsets in sync with struct mce. These two fields won't get populated for now, though. Signed-off-by: Jan Beulich Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- include/xen/interface/xen-mca.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h index d7a45f08fb48..7483a78d2425 100644 --- a/include/xen/interface/xen-mca.h +++ b/include/xen/interface/xen-mca.h @@ -331,7 +331,11 @@ struct xen_mc { }; DEFINE_GUEST_HANDLE_STRUCT(xen_mc); -/* Fields are zero when not available */ +/* + * Fields are zero when not available. Also, this struct is shared with + * userspace mcelog and thus must keep existing fields at current offsets. + * Only add new fields to the end of the structure + */ struct xen_mce { __u64 status; __u64 misc; @@ -352,6 +356,9 @@ struct xen_mce { __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ }; /* -- cgit v1.2.3 From 6917d0689993f46d97d40dd66c601d0fd5b1dbdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2019 15:34:33 +0100 Subject: block: merge invalidate_partitions into rescan_partitions A lot of the logic in invalidate_partitions and rescan_partitions is shared. Merge the two functions to simplify things. There is a small behavior change in that we now send the kevent change notice also if we were not invalidating but no partitions were found, which seems like the right thing to do. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/genhd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 8b5330dd5ac0..fd7774e64f0b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -622,8 +622,8 @@ extern dev_t blk_lookup_devt(const char *name, int partno); extern char *disk_name (struct gendisk *hd, int partno, char *buf); extern int disk_expand_part_tbl(struct gendisk *disk, int target); -extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); -extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev); +int rescan_partitions(struct gendisk *disk, struct block_device *bdev, + bool invalidate); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, -- cgit v1.2.3 From a1548b674403c0de70cc29a1575689917ba60157 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2019 15:34:34 +0100 Subject: block: move rescan_partitions to fs/block_dev.c Large parts of rescan_partitions aren't about partitions, and moving it to block_dev.c will allow for some further cleanups by merging it into its only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 -- include/linux/genhd.h | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0d909d35763..d233dd661df7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2703,8 +2703,6 @@ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK -extern void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev, bool verbose); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fd7774e64f0b..f5cffbf63abf 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -621,9 +621,9 @@ extern void blk_invalidate_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); extern char *disk_name (struct gendisk *hd, int partno, char *buf); +int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); +int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); extern int disk_expand_part_tbl(struct gendisk *disk, int target); -int rescan_partitions(struct gendisk *disk, struct block_device *bdev, - bool invalidate); extern struct hd_struct * __must_check add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, -- cgit v1.2.3 From 142fe8f4bb169e8632024d51c64653a8bf140561 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2019 15:34:35 +0100 Subject: block: fix bdev_disk_changed for non-partitioned devices We still have to set the capacity to 0 if invalidating or call revalidate_disk if not even if the disk has no partitions. Fix that by merging rescan_partitions into bdev_disk_changed and just stubbing out blk_add_partitions and blk_drop_partitions for non-partitioned devices. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f5cffbf63abf..8bb63027e4d6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -621,6 +621,7 @@ extern void blk_invalidate_devt(dev_t devt); extern dev_t blk_lookup_devt(const char *name, int partno); extern char *disk_name (struct gendisk *hd, int partno, char *buf); +int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); extern int disk_expand_part_tbl(struct gendisk *disk, int target); -- cgit v1.2.3 From f0b870df80bc70dad432fd0c142bb709a49964f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2019 15:34:36 +0100 Subject: block: remove (__)blkdev_reread_part as an exported API In general drivers should never mess with partition tables directly. Unfortunately s390 and loop do for somewhat historic reasons, but they can use bdev_disk_changed directly instead when we export it as they satisfy the sanity checks we have in __blkdev_reread_part. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland [dasd] Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index d233dd661df7..ae6c5c37f3ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2632,8 +2632,6 @@ extern void bd_finish_claiming(struct block_device *bdev, extern void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); -extern int __blkdev_reread_part(struct block_device *bdev); -extern int blkdev_reread_part(struct block_device *bdev); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -- cgit v1.2.3 From bd1903b7c4596ba6f7677d0dfefd05ba5876707d Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 13 Nov 2019 23:04:49 +0800 Subject: net: openvswitch: add hash info to upcall When using the kernel datapath, the upcall don't include skb hash info relatived. That will introduce some problem, because the hash of skb is important in kernel stack. For example, VXLAN module uses it to select UDP src port. The tx queue selection may also use the hash in stack. Hash is computed in different ways. Hash is random for a TCP socket, and hash may be computed in hardware, or software stack. Recalculation hash is not easy. Hash of TCP socket is computed: tcp_v4_connect -> sk_set_txhash (is random) __tcp_transmit_skb -> skb_set_hash_from_sk There will be one upcall, without information of skb hash, to ovs-vswitchd, for the first packet of a TCP session. The rest packets will be processed in Open vSwitch modules, hash kept. If this tcp session is forward to VXLAN module, then the UDP src port of first tcp packet is different from rest packets. TCP packets may come from the host or dockers, to Open vSwitch. To fix it, we store the hash info to upcall, and restore hash when packets sent back. +---------------+ +-------------------------+ | Docker/VMs | | ovs-vswitchd | +----+----------+ +-+--------------------+--+ | ^ | | | | | | upcall v restore packet hash (not recalculate) | +-+--------------------+--+ | tap netdev | | vxlan module +---------------> +--> Open vSwitch ko +--> or internal type | | +-------------------------+ Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1887a451c388..a87b44cd5590 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -173,6 +173,7 @@ enum ovs_packet_cmd { * @OVS_PACKET_ATTR_LEN: Packet size before truncation. * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -190,7 +191,8 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ - OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_HASH, /* Packet hash. */ __OVS_PACKET_ATTR_MAX }; -- cgit v1.2.3 From 4d66c56f7efe122d09d06cd3ebfa52a43d51a9cb Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 13 Nov 2019 10:42:25 -0600 Subject: dt-bindings: net: dp83869: Add TI dp83869 phy Add dt bindings for the TI dp83869 Gigabit ethernet phy device. Signed-off-by: Dan Murphy CC: Rob Herring Signed-off-by: David S. Miller --- include/dt-bindings/net/ti-dp83869.h | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/dt-bindings/net/ti-dp83869.h (limited to 'include') diff --git a/include/dt-bindings/net/ti-dp83869.h b/include/dt-bindings/net/ti-dp83869.h new file mode 100644 index 000000000000..218b1a64e975 --- /dev/null +++ b/include/dt-bindings/net/ti-dp83869.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Device Tree constants for the Texas Instruments DP83869 PHY + * + * Author: Dan Murphy + * + * Copyright: (C) 2019 Texas Instruments, Inc. + */ + +#ifndef _DT_BINDINGS_TI_DP83869_H +#define _DT_BINDINGS_TI_DP83869_H + +/* PHY CTRL bits */ +#define DP83869_PHYCR_FIFO_DEPTH_3_B_NIB 0x00 +#define DP83869_PHYCR_FIFO_DEPTH_4_B_NIB 0x01 +#define DP83869_PHYCR_FIFO_DEPTH_6_B_NIB 0x02 +#define DP83869_PHYCR_FIFO_DEPTH_8_B_NIB 0x03 + +/* IO_MUX_CFG - Clock output selection */ +#define DP83869_CLK_O_SEL_CHN_A_RCLK 0x0 +#define DP83869_CLK_O_SEL_CHN_B_RCLK 0x1 +#define DP83869_CLK_O_SEL_CHN_C_RCLK 0x2 +#define DP83869_CLK_O_SEL_CHN_D_RCLK 0x3 +#define DP83869_CLK_O_SEL_CHN_A_RCLK_DIV5 0x4 +#define DP83869_CLK_O_SEL_CHN_B_RCLK_DIV5 0x5 +#define DP83869_CLK_O_SEL_CHN_C_RCLK_DIV5 0x6 +#define DP83869_CLK_O_SEL_CHN_D_RCLK_DIV5 0x7 +#define DP83869_CLK_O_SEL_CHN_A_TCLK 0x8 +#define DP83869_CLK_O_SEL_CHN_B_TCLK 0x9 +#define DP83869_CLK_O_SEL_CHN_C_TCLK 0xa +#define DP83869_CLK_O_SEL_CHN_D_TCLK 0xb +#define DP83869_CLK_O_SEL_REF_CLK 0xc + +#define DP83869_RGMII_COPPER_ETHERNET 0x00 +#define DP83869_RGMII_1000_BASE 0x01 +#define DP83869_RGMII_100_BASE 0x02 +#define DP83869_RGMII_SGMII_BRIDGE 0x03 +#define DP83869_1000M_MEDIA_CONVERT 0x04 +#define DP83869_100M_MEDIA_CONVERT 0x05 +#define DP83869_SGMII_COPPER_ETHERNET 0x06 + +#endif -- cgit v1.2.3 From db205c766862edae48d64e69e2f2502e2a3e9135 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:37 +0100 Subject: vsock: remove vm_sockets_get_local_cid() vm_sockets_get_local_cid() is only used in virtio_transport_common.c. We can replace it calling the virtio_transport_get_ops() and using the get_local_cid() callback registered by the transport. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/vm_sockets.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h index 33f1a2ecd905..7dd899ccb920 100644 --- a/include/linux/vm_sockets.h +++ b/include/linux/vm_sockets.h @@ -10,6 +10,4 @@ #include -int vm_sockets_get_local_cid(void); - #endif /* _VM_SOCKETS_H */ -- cgit v1.2.3 From 3603a2e991a82e5094c3107a792859b08342aed3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:38 +0100 Subject: vsock: remove include/linux/vm_sockets.h file This header file now only includes the "uapi/linux/vm_sockets.h". We can include directly it when needed. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/vm_sockets.h | 13 ------------- include/net/af_vsock.h | 2 +- include/net/vsock_addr.h | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) delete mode 100644 include/linux/vm_sockets.h (limited to 'include') diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h deleted file mode 100644 index 7dd899ccb920..000000000000 --- a/include/linux/vm_sockets.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * VMware vSockets Driver - * - * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. - */ - -#ifndef _VM_SOCKETS_H -#define _VM_SOCKETS_H - -#include - -#endif /* _VM_SOCKETS_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 80ea0f93d3f7..c660402b10f2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -10,7 +10,7 @@ #include #include -#include +#include #include "vsock_addr.h" diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index 57d2db5c4bdf..cf8cc140d68d 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -8,7 +8,7 @@ #ifndef _VSOCK_ADDR_H_ #define _VSOCK_ADDR_H_ -#include +#include void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port); int vsock_addr_validate(const struct sockaddr_vm *addr); -- cgit v1.2.3 From fe502c4a38d97e5f8b9d5602af1f07f5abc529d2 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:39 +0100 Subject: vsock: add 'transport' member in the struct vsock_sock As a preparation to support multiple transports, this patch adds the 'transport' member at the 'struct vsock_sock'. This new field is initialized during the creation in the __vsock_create() function. This patch also renames the global 'transport' pointer to 'transport_single', since for now we're only supporting a single transport registered at run-time. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index c660402b10f2..a5e1e134261d 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,7 @@ extern spinlock_t vsock_table_lock; struct vsock_sock { /* sk must be the first member. */ struct sock sk; + const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ -- cgit v1.2.3 From 4c7246dc45e2706770d5233f7ce1597a07e069ba Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:40 +0100 Subject: vsock/virtio: add transport parameter to the virtio_transport_reset_no_sock() We are going to add 'struct vsock_sock *' parameter to virtio_transport_get_ops(). In some cases, like in the virtio_transport_reset_no_sock(), we don't have any socket assigned to the packet received, so we can't use the virtio_transport_get_ops(). In order to allow virtio_transport_reset_no_sock() to use the '.send_pkt' callback from the 'vhost_transport' or 'virtio_transport', we add the 'struct virtio_transport *' to it and to its caller: virtio_transport_recv_pkt(). We moved the 'vhost_transport' and 'virtio_transport' definition, to pass their address to the virtio_transport_recv_pkt(). Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 07875ccc7bb5..b139f76060a6 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -150,7 +150,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk, void virtio_transport_destruct(struct vsock_sock *vsk); -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt); void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); -- cgit v1.2.3 From daabfbca34ecfa936d3bf5219167c4c5e67db150 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:41 +0100 Subject: vsock: add 'struct vsock_sock *' param to vsock_core_get_transport() Since now the 'struct vsock_sock' object contains a pointer to the transport, this patch adds a parameter to the vsock_core_get_transport() to return the right transport assigned to the socket. This patch modifies also the virtio_transport_get_ops(), that uses the vsock_core_get_transport(), adding the 'struct vsock_sock *' parameter. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a5e1e134261d..2ca67d048de4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -166,7 +166,7 @@ static inline int vsock_core_init(const struct vsock_transport *t) void vsock_core_exit(void); /* The transport may downcast this to access transport-specific functions */ -const struct vsock_transport *vsock_core_get_transport(void); +const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ -- cgit v1.2.3 From b9f2b0ffde0c9b666b2b1672eb468b8f805a9b97 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:42 +0100 Subject: vsock: handle buffer_size sockopts in the core virtio_transport and vmci_transport handle the buffer_size sockopts in a very similar way. In order to support multiple transports, this patch moves this handling in the core to allow the user to change the options also if the socket is not yet assigned to any transport. This patch also adds the '.notify_buffer_size' callback in the 'struct virtio_transport' in order to inform the transport, when the buffer_size is changed by the user. It is also useful to limit the 'buffer_size' requested (e.g. virtio transports). Acked-by: Dexuan Cui Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 15 +-------------- include/net/af_vsock.h | 15 +++++++-------- 2 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index b139f76060a6..71c81e0dc8f2 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -7,9 +7,6 @@ #include #include -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) @@ -25,11 +22,6 @@ enum { struct virtio_vsock_sock { struct vsock_sock *vsk; - /* Protected by lock_sock(sk_vsock(trans->vsk)) */ - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - spinlock_t tx_lock; spinlock_t rx_lock; @@ -92,12 +84,6 @@ s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); int virtio_transport_notify_poll_in(struct vsock_sock *vsk, size_t target, @@ -124,6 +110,7 @@ int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, struct vsock_transport_send_notify_data *data); int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written, struct vsock_transport_send_notify_data *data); +void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val); u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); bool virtio_transport_stream_is_active(struct vsock_sock *vsk); diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 2ca67d048de4..4b5d16840fd4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -65,6 +65,11 @@ struct vsock_sock { bool sent_request; bool ignore_connecting_rst; + /* Protected by lock_sock(sk) */ + u64 buffer_size; + u64 buffer_min_size; + u64 buffer_max_size; + /* Private to transport. */ void *trans; }; @@ -140,18 +145,12 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); + /* sk_lock held by the caller */ + void (*notify_buffer_size)(struct vsock_sock *, u64 *); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); - /* Buffer sizes. */ - void (*set_buffer_size)(struct vsock_sock *, u64); - void (*set_min_buffer_size)(struct vsock_sock *, u64); - void (*set_max_buffer_size)(struct vsock_sock *, u64); - u64 (*get_buffer_size)(struct vsock_sock *); - u64 (*get_min_buffer_size)(struct vsock_sock *); - u64 (*get_max_buffer_size)(struct vsock_sock *); - /* Addressing. */ u32 (*get_local_cid)(void); }; -- cgit v1.2.3 From b9ca2f5ff7784d46285a8f1b14419ac4645096f7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:43 +0100 Subject: vsock: add vsock_create_connected() called by transports All transports call __vsock_create() with the same parameters, most of them depending on the parent socket. In order to simplify the VSOCK core APIs exposed to the transports, this patch adds the vsock_create_connected() callable from transports to create a new socket when a connection request is received. We also unexported the __vsock_create(). Suggested-by: Stefan Hajnoczi Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 4b5d16840fd4..fa1570dc9f5c 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -76,10 +76,7 @@ struct vsock_sock { s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); -struct sock *__vsock_create(struct net *net, - struct socket *sock, - struct sock *parent, - gfp_t priority, unsigned short type, int kern); +struct sock *vsock_create_connected(struct sock *parent); /**** TRANSPORT ****/ -- cgit v1.2.3 From c0cfa2d8a788fcf45df5bf4070ab2474c88d543a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:46 +0100 Subject: vsock: add multi-transports support This patch adds the support of multiple transports in the VSOCK core. With the multi-transports support, we can use vsock with nested VMs (using also different hypervisors) loading both guest->host and host->guest transports at the same time. Major changes: - vsock core module can be loaded regardless of the transports - vsock_core_init() and vsock_core_exit() are renamed to vsock_core_register() and vsock_core_unregister() - vsock_core_register() has a feature parameter (H2G, G2H, DGRAM) to identify which directions the transport can handle and if it's support DGRAM (only vmci) - each stream socket is assigned to a transport when the remote CID is set (during the connect() or when we receive a connection request on a listener socket). The remote CID is used to decide which transport to use: - remote CID <= VMADDR_CID_HOST will use guest->host transport; - remote CID == local_cid (guest->host transport) will use guest->host transport for loopback (host->guest transports don't support loopback); - remote CID > VMADDR_CID_HOST will use host->guest transport; - listener sockets are not bound to any transports since no transport operations are done on it. In this way we can create a listener socket, also if the transports are not loaded or with VMADDR_CID_ANY to listen on all transports. - DGRAM sockets are handled as before, since only the vmci_transport provides this feature. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index fa1570dc9f5c..cf5c3691251b 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -91,6 +91,14 @@ struct vsock_transport_send_notify_data { u64 data2; /* Transport-defined. */ }; +/* Transport features flags */ +/* Transport provides host->guest communication */ +#define VSOCK_TRANSPORT_F_H2G 0x00000001 +/* Transport provides guest->host communication */ +#define VSOCK_TRANSPORT_F_G2H 0x00000002 +/* Transport provides DGRAM communication */ +#define VSOCK_TRANSPORT_F_DGRAM 0x00000004 + struct vsock_transport { /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); @@ -154,12 +162,8 @@ struct vsock_transport { /**** CORE ****/ -int __vsock_core_init(const struct vsock_transport *t, struct module *owner); -static inline int vsock_core_init(const struct vsock_transport *t) -{ - return __vsock_core_init(t, THIS_MODULE); -} -void vsock_core_exit(void); +int vsock_core_register(const struct vsock_transport *t, int features); +void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); @@ -190,6 +194,8 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); +bool vsock_find_cid(unsigned int cid); /**** TAP ****/ -- cgit v1.2.3 From b1bba80a4376aef34de2b57bfb8834bd095703ed Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:47 +0100 Subject: vsock/vmci: register vmci_transport only when VMCI guest/host are active To allow other transports to be loaded with vmci_transport, we register the vmci_transport as G2H or H2G only when a VMCI guest or host is active. To do that, this patch adds a callback registered in the vmci driver that will be called when the host or guest becomes active. This callback will register the vmci_transport in the VSOCK core. Cc: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/vmw_vmci_api.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h index acd9fafe4fc6..f28907345c80 100644 --- a/include/linux/vmw_vmci_api.h +++ b/include/linux/vmw_vmci_api.h @@ -19,6 +19,7 @@ struct msghdr; typedef void (vmci_device_shutdown_fn) (void *device_registration, void *user_data); +typedef void (*vmci_vsock_cb) (bool is_host); int vmci_datagram_create_handle(u32 resource_id, u32 flags, vmci_datagram_recv_cb recv_cb, @@ -37,6 +38,7 @@ int vmci_doorbell_destroy(struct vmci_handle handle); int vmci_doorbell_notify(struct vmci_handle handle, u32 priv_flags); u32 vmci_get_context_id(void); bool vmci_is_context_owner(u32 context_id, kuid_t uid); +int vmci_register_vsock_callback(vmci_vsock_cb callback); int vmci_event_subscribe(u32 event, vmci_event_cb callback, void *callback_data, -- cgit v1.2.3 From 6a2c0962105ae8ceba182c4f616e0e41d7755591 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:48 +0100 Subject: vsock: prevent transport modules unloading This patch adds 'module' member in the 'struct vsock_transport' in order to get/put the transport module. This prevents the module unloading while sockets are assigned to it. We increase the module refcnt when a socket is assigned to a transport, and we decrease the module refcnt when the socket is destructed. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index cf5c3691251b..4206dc6d813f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,8 @@ struct vsock_transport_send_notify_data { #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 struct vsock_transport { + struct module *module; + /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); -- cgit v1.2.3 From 3ca270fc9edb258d5bfa271bcf851614e9e6e7d4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:38 +0800 Subject: perf/core: Provide a kernel-internal interface to recalibrate event period Currently, perf_event_period() is used by user tools via ioctl. Based on naming convention, exporting perf_event_period() for kernel users (such as KVM) who may recalibrate the event period for their assigned counter according to their requirements. The perf_event_period() is an external accessor, just like the perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). Suggested-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61448c19a132..d601df36e671 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1336,6 +1336,7 @@ extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); +extern int perf_event_period(struct perf_event *event, u64 value); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, @@ -1415,6 +1416,10 @@ static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } +static inline int perf_event_period(struct perf_event *event, u64 value) +{ + return -EINVAL; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -- cgit v1.2.3 From 52ba4b0b99770e892f43da1238f437155acb8b58 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:39 +0800 Subject: perf/core: Provide a kernel-internal interface to pause perf_event Exporting perf_event_pause() as an external accessor for kernel users (such as KVM) who may do both disable perf_event and read count with just one time to hold perf_event_ctx_lock. Also the value could be reset optionally. Suggested-by: Peter Zijlstra Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d601df36e671..e9768bfc76f6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1337,6 +1337,7 @@ extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); +extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, @@ -1420,6 +1421,10 @@ static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } +static inline u64 perf_event_pause(struct perf_event *event, bool reset) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -- cgit v1.2.3 From 8750e72a79dda2f665ce17b62049f4d62130d991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Thu, 7 Nov 2019 07:53:42 -0500 Subject: KVM: remember position in kvm->vcpus array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fetching an index for any vcpu in kvm->vcpus array by traversing the entire array everytime is costly. This patch remembers the position of each vcpu in kvm->vcpus array by storing it in vcpus_idx under kvm_vcpu structure. Signed-off-by: Radim Krčmář Signed-off-by: Nitesh Narayan Lal Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a817e446c9aa..70b2296fb2ae 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -266,7 +266,8 @@ struct kvm_vcpu { struct preempt_notifier preempt_notifier; #endif int cpu; - int vcpu_id; + int vcpu_id; /* id given by userspace at creation */ + int vcpu_idx; /* index in kvm->vcpus array */ int srcu_idx; int mode; u64 requests; @@ -570,13 +571,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *tmp; - int idx; - - kvm_for_each_vcpu(idx, tmp, vcpu->kvm) - if (tmp == vcpu) - return idx; - BUG(); + return vcpu->vcpu_idx; } #define kvm_for_each_memslot(memslot, slots) \ -- cgit v1.2.3 From 7ee30bc132c683d06a6d9e360e39e483e3990708 Mon Sep 17 00:00:00 2001 From: Nitesh Narayan Lal Date: Thu, 7 Nov 2019 07:53:43 -0500 Subject: KVM: x86: deliver KVM IOAPIC scan request to target vCPUs In IOAPIC fixed delivery mode instead of flushing the scan requests to all vCPUs, we should only send the requests to vCPUs specified within the destination field. This patch introduces kvm_get_dest_vcpus_mask() API which retrieves an array of target vCPUs by using kvm_apic_map_get_dest_lapic() and then based on the vcpus_idx, it sets the bit in a bitmap. However, if the above fails kvm_get_dest_vcpus_mask() finds the target vCPUs by traversing all available vCPUs. Followed by setting the bits in the bitmap. If we had different vCPUs in the previous request for the same redirection table entry then bits corresponding to these vCPUs are also set. This to done to keep ioapic_handled_vectors synchronized. This bitmap is then eventually passed on to kvm_make_vcpus_request_mask() to generate a masked request only for the target vCPUs. This would enable us to reduce the latency overhead on isolated vCPUs caused by the IPI to process due to KVM_REQ_IOAPIC_SCAN. Suggested-by: Marcelo Tosatti Signed-off-by: Nitesh Narayan Lal Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 70b2296fb2ae..bfe6c6729988 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -786,6 +786,8 @@ void kvm_reload_remote_mmus(struct kvm *kvm); bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, unsigned long *vcpu_bitmap, cpumask_var_t tmp); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -- cgit v1.2.3 From 7d34aec52d292197d6b13395b023c718cac4630f Mon Sep 17 00:00:00 2001 From: Christoph Fritz Date: Wed, 13 Nov 2019 14:40:13 +0100 Subject: regulator: da9062: refactor buck modes into header This patch refactors buck modes into a header file so that device trees can make use of these mode constants. The new header filename uses da9063 because DA9063 was the earlier chip and its driver code will want updating at some point in a similar manner. Signed-off-by: Christoph Fritz Link: https://lore.kernel.org/r/1573652416-9848-2-git-send-email-chf.fritz@googlemail.com Signed-off-by: Mark Brown --- include/dt-bindings/regulator/dlg,da9063-regulator.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/dt-bindings/regulator/dlg,da9063-regulator.h (limited to 'include') diff --git a/include/dt-bindings/regulator/dlg,da9063-regulator.h b/include/dt-bindings/regulator/dlg,da9063-regulator.h new file mode 100644 index 000000000000..1de710dd0899 --- /dev/null +++ b/include/dt-bindings/regulator/dlg,da9063-regulator.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _DT_BINDINGS_REGULATOR_DLG_DA9063_H +#define _DT_BINDINGS_REGULATOR_DLG_DA9063_H + +/* + * These buck mode constants may be used to specify values in device tree + * properties (e.g. regulator-initial-mode). + * A description of the following modes is in the manufacturers datasheet. + */ + +#define DA9063_BUCK_MODE_SLEEP 1 +#define DA9063_BUCK_MODE_SYNC 2 +#define DA9063_BUCK_MODE_AUTO 3 + +#endif -- cgit v1.2.3 From 42bfba9eaa33dd4af0b50b87508062a41ec26653 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 14 Nov 2019 13:02:41 +0100 Subject: net/smc: immediate termination for SMCD link groups SMCD link group termination is called when peer signals its shutdown of its corresponding link group. For regular shutdowns no connections exist anymore. For abnormal shutdowns connections must be killed and their DMBs must be unregistered immediately. That means the SMCR method to delay the link group freeing several seconds does not fit. This patch adds immediate termination of a link group and its SMCD connections and makes sure all SMCD link group related cleanup steps are finished. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 05174ae4f325..7c2082341bb3 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -37,6 +37,8 @@ struct smcd_dmb { #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 +#define ISM_ERROR 0xFFFF + struct smcd_event { u32 type; u32 code; -- cgit v1.2.3 From 5edd6b9cb8d7c6c346c93c52a53735591127e879 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 14 Nov 2019 13:02:43 +0100 Subject: net/smc: introduce bookkeeping of SMCD link groups If the ism module is unloaded return control from exit routine only, if all link groups are freed. If an IB device is thrown away return control from device removal only, if all link groups belonging to this device are freed. A counters for the total number of SMCD link groups per ISM device is introduced. ism module unloading continues only if the total number of SMCD link groups for all ISM devices is zero. ISM device removal continues only it the total number of SMCD link groups per ISM device has decreased to zero. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 7c2082341bb3..646feb4bc75f 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -79,6 +79,8 @@ struct smcd_dev { bool pnetid_by_user; struct list_head lgr_list; spinlock_t lgr_lock; + atomic_t lgr_cnt; + wait_queue_head_t lgrs_deleted; u8 going_away : 1; }; -- cgit v1.2.3 From 5e2563650232a4d998a60b10d3679f65dd4c02fb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:27 +0200 Subject: net: mscc: ocelot: publish structure definitions to include/soc/mscc/ocelot.h We will be registering another switch driver based on ocelot, which lives under drivers/net/dsa. Make sure the Felix DSA front-end has the necessary abstractions to implement a new Ocelot driver instantiation. This includes the function prototypes for implementing DSA callbacks. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 539 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 include/soc/mscc/ocelot.h (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h new file mode 100644 index 000000000000..a836afe8f68e --- /dev/null +++ b/include/soc/mscc/ocelot.h @@ -0,0 +1,539 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* Copyright (c) 2017 Microsemi Corporation + */ + +#ifndef _SOC_MSCC_OCELOT_H +#define _SOC_MSCC_OCELOT_H + +#include +#include +#include +#include +#include + +#define IFH_INJ_BYPASS BIT(31) +#define IFH_INJ_POP_CNT_DISABLE (3 << 28) + +#define IFH_TAG_TYPE_C 0 +#define IFH_TAG_TYPE_S 1 + +#define IFH_REW_OP_NOOP 0x0 +#define IFH_REW_OP_DSCP 0x1 +#define IFH_REW_OP_ONE_STEP_PTP 0x2 +#define IFH_REW_OP_TWO_STEP_PTP 0x3 +#define IFH_REW_OP_ORIGIN_PTP 0x5 + +#define OCELOT_TAG_LEN 16 +#define OCELOT_SHORT_PREFIX_LEN 4 +#define OCELOT_LONG_PREFIX_LEN 16 + +#define OCELOT_SPEED_2500 0 +#define OCELOT_SPEED_1000 1 +#define OCELOT_SPEED_100 2 +#define OCELOT_SPEED_10 3 + +#define TARGET_OFFSET 24 +#define REG_MASK GENMASK(TARGET_OFFSET - 1, 0) +#define REG(reg, offset) [reg & REG_MASK] = offset + +#define REG_RESERVED_ADDR 0xffffffff +#define REG_RESERVED(reg) REG(reg, REG_RESERVED_ADDR) + +enum ocelot_target { + ANA = 1, + QS, + QSYS, + REW, + SYS, + S2, + HSIO, + PTP, + GCB, + TARGET_MAX, +}; + +enum ocelot_reg { + ANA_ADVLEARN = ANA << TARGET_OFFSET, + ANA_VLANMASK, + ANA_PORT_B_DOMAIN, + ANA_ANAGEFIL, + ANA_ANEVENTS, + ANA_STORMLIMIT_BURST, + ANA_STORMLIMIT_CFG, + ANA_ISOLATED_PORTS, + ANA_COMMUNITY_PORTS, + ANA_AUTOAGE, + ANA_MACTOPTIONS, + ANA_LEARNDISC, + ANA_AGENCTRL, + ANA_MIRRORPORTS, + ANA_EMIRRORPORTS, + ANA_FLOODING, + ANA_FLOODING_IPMC, + ANA_SFLOW_CFG, + ANA_PORT_MODE, + ANA_CUT_THRU_CFG, + ANA_PGID_PGID, + ANA_TABLES_ANMOVED, + ANA_TABLES_MACHDATA, + ANA_TABLES_MACLDATA, + ANA_TABLES_STREAMDATA, + ANA_TABLES_MACACCESS, + ANA_TABLES_MACTINDX, + ANA_TABLES_VLANACCESS, + ANA_TABLES_VLANTIDX, + ANA_TABLES_ISDXACCESS, + ANA_TABLES_ISDXTIDX, + ANA_TABLES_ENTRYLIM, + ANA_TABLES_PTP_ID_HIGH, + ANA_TABLES_PTP_ID_LOW, + ANA_TABLES_STREAMACCESS, + ANA_TABLES_STREAMTIDX, + ANA_TABLES_SEQ_HISTORY, + ANA_TABLES_SEQ_MASK, + ANA_TABLES_SFID_MASK, + ANA_TABLES_SFIDACCESS, + ANA_TABLES_SFIDTIDX, + ANA_MSTI_STATE, + ANA_OAM_UPM_LM_CNT, + ANA_SG_ACCESS_CTRL, + ANA_SG_CONFIG_REG_1, + ANA_SG_CONFIG_REG_2, + ANA_SG_CONFIG_REG_3, + ANA_SG_CONFIG_REG_4, + ANA_SG_CONFIG_REG_5, + ANA_SG_GCL_GS_CONFIG, + ANA_SG_GCL_TI_CONFIG, + ANA_SG_STATUS_REG_1, + ANA_SG_STATUS_REG_2, + ANA_SG_STATUS_REG_3, + ANA_PORT_VLAN_CFG, + ANA_PORT_DROP_CFG, + ANA_PORT_QOS_CFG, + ANA_PORT_VCAP_CFG, + ANA_PORT_VCAP_S1_KEY_CFG, + ANA_PORT_VCAP_S2_CFG, + ANA_PORT_PCP_DEI_MAP, + ANA_PORT_CPU_FWD_CFG, + ANA_PORT_CPU_FWD_BPDU_CFG, + ANA_PORT_CPU_FWD_GARP_CFG, + ANA_PORT_CPU_FWD_CCM_CFG, + ANA_PORT_PORT_CFG, + ANA_PORT_POL_CFG, + ANA_PORT_PTP_CFG, + ANA_PORT_PTP_DLY1_CFG, + ANA_PORT_PTP_DLY2_CFG, + ANA_PORT_SFID_CFG, + ANA_PFC_PFC_CFG, + ANA_PFC_PFC_TIMER, + ANA_IPT_OAM_MEP_CFG, + ANA_IPT_IPT, + ANA_PPT_PPT, + ANA_FID_MAP_FID_MAP, + ANA_AGGR_CFG, + ANA_CPUQ_CFG, + ANA_CPUQ_CFG2, + ANA_CPUQ_8021_CFG, + ANA_DSCP_CFG, + ANA_DSCP_REWR_CFG, + ANA_VCAP_RNG_TYPE_CFG, + ANA_VCAP_RNG_VAL_CFG, + ANA_VRAP_CFG, + ANA_VRAP_HDR_DATA, + ANA_VRAP_HDR_MASK, + ANA_DISCARD_CFG, + ANA_FID_CFG, + ANA_POL_PIR_CFG, + ANA_POL_CIR_CFG, + ANA_POL_MODE_CFG, + ANA_POL_PIR_STATE, + ANA_POL_CIR_STATE, + ANA_POL_STATE, + ANA_POL_FLOWC, + ANA_POL_HYST, + ANA_POL_MISC_CFG, + QS_XTR_GRP_CFG = QS << TARGET_OFFSET, + QS_XTR_RD, + QS_XTR_FRM_PRUNING, + QS_XTR_FLUSH, + QS_XTR_DATA_PRESENT, + QS_XTR_CFG, + QS_INJ_GRP_CFG, + QS_INJ_WR, + QS_INJ_CTRL, + QS_INJ_STATUS, + QS_INJ_ERR, + QS_INH_DBG, + QSYS_PORT_MODE = QSYS << TARGET_OFFSET, + QSYS_SWITCH_PORT_MODE, + QSYS_STAT_CNT_CFG, + QSYS_EEE_CFG, + QSYS_EEE_THRES, + QSYS_IGR_NO_SHARING, + QSYS_EGR_NO_SHARING, + QSYS_SW_STATUS, + QSYS_EXT_CPU_CFG, + QSYS_PAD_CFG, + QSYS_CPU_GROUP_MAP, + QSYS_QMAP, + QSYS_ISDX_SGRP, + QSYS_TIMED_FRAME_ENTRY, + QSYS_TFRM_MISC, + QSYS_TFRM_PORT_DLY, + QSYS_TFRM_TIMER_CFG_1, + QSYS_TFRM_TIMER_CFG_2, + QSYS_TFRM_TIMER_CFG_3, + QSYS_TFRM_TIMER_CFG_4, + QSYS_TFRM_TIMER_CFG_5, + QSYS_TFRM_TIMER_CFG_6, + QSYS_TFRM_TIMER_CFG_7, + QSYS_TFRM_TIMER_CFG_8, + QSYS_RED_PROFILE, + QSYS_RES_QOS_MODE, + QSYS_RES_CFG, + QSYS_RES_STAT, + QSYS_EGR_DROP_MODE, + QSYS_EQ_CTRL, + QSYS_EVENTS_CORE, + QSYS_QMAXSDU_CFG_0, + QSYS_QMAXSDU_CFG_1, + QSYS_QMAXSDU_CFG_2, + QSYS_QMAXSDU_CFG_3, + QSYS_QMAXSDU_CFG_4, + QSYS_QMAXSDU_CFG_5, + QSYS_QMAXSDU_CFG_6, + QSYS_QMAXSDU_CFG_7, + QSYS_PREEMPTION_CFG, + QSYS_CIR_CFG, + QSYS_EIR_CFG, + QSYS_SE_CFG, + QSYS_SE_DWRR_CFG, + QSYS_SE_CONNECT, + QSYS_SE_DLB_SENSE, + QSYS_CIR_STATE, + QSYS_EIR_STATE, + QSYS_SE_STATE, + QSYS_HSCH_MISC_CFG, + QSYS_TAG_CONFIG, + QSYS_TAS_PARAM_CFG_CTRL, + QSYS_PORT_MAX_SDU, + QSYS_PARAM_CFG_REG_1, + QSYS_PARAM_CFG_REG_2, + QSYS_PARAM_CFG_REG_3, + QSYS_PARAM_CFG_REG_4, + QSYS_PARAM_CFG_REG_5, + QSYS_GCL_CFG_REG_1, + QSYS_GCL_CFG_REG_2, + QSYS_PARAM_STATUS_REG_1, + QSYS_PARAM_STATUS_REG_2, + QSYS_PARAM_STATUS_REG_3, + QSYS_PARAM_STATUS_REG_4, + QSYS_PARAM_STATUS_REG_5, + QSYS_PARAM_STATUS_REG_6, + QSYS_PARAM_STATUS_REG_7, + QSYS_PARAM_STATUS_REG_8, + QSYS_PARAM_STATUS_REG_9, + QSYS_GCL_STATUS_REG_1, + QSYS_GCL_STATUS_REG_2, + REW_PORT_VLAN_CFG = REW << TARGET_OFFSET, + REW_TAG_CFG, + REW_PORT_CFG, + REW_DSCP_CFG, + REW_PCP_DEI_QOS_MAP_CFG, + REW_PTP_CFG, + REW_PTP_DLY1_CFG, + REW_RED_TAG_CFG, + REW_DSCP_REMAP_DP1_CFG, + REW_DSCP_REMAP_CFG, + REW_STAT_CFG, + REW_REW_STICKY, + REW_PPT, + SYS_COUNT_RX_OCTETS = SYS << TARGET_OFFSET, + SYS_COUNT_RX_UNICAST, + SYS_COUNT_RX_MULTICAST, + SYS_COUNT_RX_BROADCAST, + SYS_COUNT_RX_SHORTS, + SYS_COUNT_RX_FRAGMENTS, + SYS_COUNT_RX_JABBERS, + SYS_COUNT_RX_CRC_ALIGN_ERRS, + SYS_COUNT_RX_SYM_ERRS, + SYS_COUNT_RX_64, + SYS_COUNT_RX_65_127, + SYS_COUNT_RX_128_255, + SYS_COUNT_RX_256_1023, + SYS_COUNT_RX_1024_1526, + SYS_COUNT_RX_1527_MAX, + SYS_COUNT_RX_PAUSE, + SYS_COUNT_RX_CONTROL, + SYS_COUNT_RX_LONGS, + SYS_COUNT_RX_CLASSIFIED_DROPS, + SYS_COUNT_TX_OCTETS, + SYS_COUNT_TX_UNICAST, + SYS_COUNT_TX_MULTICAST, + SYS_COUNT_TX_BROADCAST, + SYS_COUNT_TX_COLLISION, + SYS_COUNT_TX_DROPS, + SYS_COUNT_TX_PAUSE, + SYS_COUNT_TX_64, + SYS_COUNT_TX_65_127, + SYS_COUNT_TX_128_511, + SYS_COUNT_TX_512_1023, + SYS_COUNT_TX_1024_1526, + SYS_COUNT_TX_1527_MAX, + SYS_COUNT_TX_AGING, + SYS_RESET_CFG, + SYS_SR_ETYPE_CFG, + SYS_VLAN_ETYPE_CFG, + SYS_PORT_MODE, + SYS_FRONT_PORT_MODE, + SYS_FRM_AGING, + SYS_STAT_CFG, + SYS_SW_STATUS, + SYS_MISC_CFG, + SYS_REW_MAC_HIGH_CFG, + SYS_REW_MAC_LOW_CFG, + SYS_TIMESTAMP_OFFSET, + SYS_CMID, + SYS_PAUSE_CFG, + SYS_PAUSE_TOT_CFG, + SYS_ATOP, + SYS_ATOP_TOT_CFG, + SYS_MAC_FC_CFG, + SYS_MMGT, + SYS_MMGT_FAST, + SYS_EVENTS_DIF, + SYS_EVENTS_CORE, + SYS_CNT, + SYS_PTP_STATUS, + SYS_PTP_TXSTAMP, + SYS_PTP_NXT, + SYS_PTP_CFG, + SYS_RAM_INIT, + SYS_CM_ADDR, + SYS_CM_DATA_WR, + SYS_CM_DATA_RD, + SYS_CM_OP, + SYS_CM_DATA, + S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET, + S2_CORE_MV_CFG, + S2_CACHE_ENTRY_DAT, + S2_CACHE_MASK_DAT, + S2_CACHE_ACTION_DAT, + S2_CACHE_CNT_DAT, + S2_CACHE_TG_DAT, + PTP_PIN_CFG = PTP << TARGET_OFFSET, + PTP_PIN_TOD_SEC_MSB, + PTP_PIN_TOD_SEC_LSB, + PTP_PIN_TOD_NSEC, + PTP_CFG_MISC, + PTP_CLK_CFG_ADJ_CFG, + PTP_CLK_CFG_ADJ_FREQ, + GCB_SOFT_RST = GCB << TARGET_OFFSET, +}; + +enum ocelot_regfield { + ANA_ADVLEARN_VLAN_CHK, + ANA_ADVLEARN_LEARN_MIRROR, + ANA_ANEVENTS_FLOOD_DISCARD, + ANA_ANEVENTS_MSTI_DROP, + ANA_ANEVENTS_ACLKILL, + ANA_ANEVENTS_ACLUSED, + ANA_ANEVENTS_AUTOAGE, + ANA_ANEVENTS_VS2TTL1, + ANA_ANEVENTS_STORM_DROP, + ANA_ANEVENTS_LEARN_DROP, + ANA_ANEVENTS_AGED_ENTRY, + ANA_ANEVENTS_CPU_LEARN_FAILED, + ANA_ANEVENTS_AUTO_LEARN_FAILED, + ANA_ANEVENTS_LEARN_REMOVE, + ANA_ANEVENTS_AUTO_LEARNED, + ANA_ANEVENTS_AUTO_MOVED, + ANA_ANEVENTS_DROPPED, + ANA_ANEVENTS_CLASSIFIED_DROP, + ANA_ANEVENTS_CLASSIFIED_COPY, + ANA_ANEVENTS_VLAN_DISCARD, + ANA_ANEVENTS_FWD_DISCARD, + ANA_ANEVENTS_MULTICAST_FLOOD, + ANA_ANEVENTS_UNICAST_FLOOD, + ANA_ANEVENTS_DEST_KNOWN, + ANA_ANEVENTS_BUCKET3_MATCH, + ANA_ANEVENTS_BUCKET2_MATCH, + ANA_ANEVENTS_BUCKET1_MATCH, + ANA_ANEVENTS_BUCKET0_MATCH, + ANA_ANEVENTS_CPU_OPERATION, + ANA_ANEVENTS_DMAC_LOOKUP, + ANA_ANEVENTS_SMAC_LOOKUP, + ANA_ANEVENTS_SEQ_GEN_ERR_0, + ANA_ANEVENTS_SEQ_GEN_ERR_1, + ANA_TABLES_MACACCESS_B_DOM, + ANA_TABLES_MACTINDX_BUCKET, + ANA_TABLES_MACTINDX_M_INDEX, + QSYS_TIMED_FRAME_ENTRY_TFRM_VLD, + QSYS_TIMED_FRAME_ENTRY_TFRM_FP, + QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO, + QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL, + QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T, + SYS_RESET_CFG_CORE_ENA, + SYS_RESET_CFG_MEM_ENA, + SYS_RESET_CFG_MEM_INIT, + GCB_SOFT_RST_SWC_RST, + REGFIELD_MAX +}; + +enum ocelot_clk_pins { + ALT_PPS_PIN = 1, + EXT_CLK_PIN, + ALT_LDST_PIN, + TOD_ACC_PIN +}; + +struct ocelot_stat_layout { + u32 offset; + char name[ETH_GSTRING_LEN]; +}; + +enum ocelot_tag_prefix { + OCELOT_TAG_PREFIX_DISABLED = 0, + OCELOT_TAG_PREFIX_NONE, + OCELOT_TAG_PREFIX_SHORT, + OCELOT_TAG_PREFIX_LONG, +}; + +struct ocelot; + +struct ocelot_ops { + void (*pcs_init)(struct ocelot *ocelot, int port); + int (*reset)(struct ocelot *ocelot); +}; + +struct ocelot_port { + struct ocelot *ocelot; + + void __iomem *regs; + + /* Ingress default VLAN (pvid) */ + u16 pvid; + + /* Egress default VLAN (vid) */ + u16 vid; + + u8 ptp_cmd; + struct list_head skbs; + u8 ts_id; +}; + +struct ocelot { + struct device *dev; + + const struct ocelot_ops *ops; + struct regmap *targets[TARGET_MAX]; + struct regmap_field *regfields[REGFIELD_MAX]; + const u32 *const *map; + const struct ocelot_stat_layout *stats_layout; + unsigned int num_stats; + + int shared_queue_sz; + + struct net_device *hw_bridge_dev; + u16 bridge_mask; + u16 bridge_fwd_mask; + + struct ocelot_port **ports; + + u8 base_mac[ETH_ALEN]; + + /* Keep track of the vlan port masks */ + u32 vlan_mask[VLAN_N_VID]; + + u8 num_phys_ports; + u8 num_cpu_ports; + u8 cpu; + + u32 *lags; + + struct list_head multicast; + + /* Workqueue to check statistics for overflow with its lock */ + struct mutex stats_lock; + u64 *stats; + struct delayed_work stats_work; + struct workqueue_struct *stats_queue; + + u8 ptp:1; + struct ptp_clock *ptp_clock; + struct ptp_clock_info ptp_info; + struct hwtstamp_config hwtstamp_config; + /* Protects the PTP interface state */ + struct mutex ptp_lock; + /* Protects the PTP clock */ + spinlock_t ptp_clock_lock; + + void (*port_pcs_init)(struct ocelot_port *port); +}; + +#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) +#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) +#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0) + +#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi)) +#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri)) +#define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0) + +#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi)) +#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri)) +#define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0) + +/* I/O */ +u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); +void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); +u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); +void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); +void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, + u32 offset); + +/* Hardware initialization */ +int ocelot_regfields_init(struct ocelot *ocelot, + const struct reg_field *const regfields); +struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res); +void ocelot_set_cpu_port(struct ocelot *ocelot, int cpu, + enum ocelot_tag_prefix injection, + enum ocelot_tag_prefix extraction); +int ocelot_init(struct ocelot *ocelot); +void ocelot_deinit(struct ocelot *ocelot); +void ocelot_init_port(struct ocelot *ocelot, int port); + +/* DSA callbacks */ +void ocelot_port_enable(struct ocelot *ocelot, int port, + struct phy_device *phy); +void ocelot_port_disable(struct ocelot *ocelot, int port); +void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data); +void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data); +int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); +int ocelot_get_ts_info(struct ocelot *ocelot, int port, + struct ethtool_ts_info *info); +void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); +void ocelot_adjust_link(struct ocelot *ocelot, int port, + struct phy_device *phydev); +void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, + bool vlan_aware); +void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); +int ocelot_port_bridge_join(struct ocelot *ocelot, int port, + struct net_device *bridge); +int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, + struct net_device *bridge); +int ocelot_fdb_dump(struct ocelot *ocelot, int port, + dsa_fdb_dump_cb_t *cb, void *data); +int ocelot_fdb_add(struct ocelot *ocelot, int port, + const unsigned char *addr, u16 vid, bool vlan_aware); +int ocelot_fdb_del(struct ocelot *ocelot, int port, + const unsigned char *addr, u16 vid); +int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, + bool untagged); +int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); +int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); +void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); + +#endif -- cgit v1.2.3 From a030dfe1947310a2140b9e371dc9ebfab72c914f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:28 +0200 Subject: net: mscc: ocelot: publish ocelot_sys.h to include/soc/mscc The Felix DSA driver needs to write to SYS_RAM_INIT_RAM_INIT for its own chip initialization process. Also update the MAINTAINERS file such that the headers exported by the ocelot driver are under the same maintainers' umbrella as the driver itself. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_sys.h | 144 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/soc/mscc/ocelot_sys.h (limited to 'include') diff --git a/include/soc/mscc/ocelot_sys.h b/include/soc/mscc/ocelot_sys.h new file mode 100644 index 000000000000..16f91e172bcb --- /dev/null +++ b/include/soc/mscc/ocelot_sys.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Microsemi Ocelot Switch driver + * + * Copyright (c) 2017 Microsemi Corporation + */ + +#ifndef _MSCC_OCELOT_SYS_H_ +#define _MSCC_OCELOT_SYS_H_ + +#define SYS_COUNT_RX_OCTETS_RSZ 0x4 + +#define SYS_COUNT_TX_OCTETS_RSZ 0x4 + +#define SYS_PORT_MODE_RSZ 0x4 + +#define SYS_PORT_MODE_DATA_WO_TS(x) (((x) << 5) & GENMASK(6, 5)) +#define SYS_PORT_MODE_DATA_WO_TS_M GENMASK(6, 5) +#define SYS_PORT_MODE_DATA_WO_TS_X(x) (((x) & GENMASK(6, 5)) >> 5) +#define SYS_PORT_MODE_INCL_INJ_HDR(x) (((x) << 3) & GENMASK(4, 3)) +#define SYS_PORT_MODE_INCL_INJ_HDR_M GENMASK(4, 3) +#define SYS_PORT_MODE_INCL_INJ_HDR_X(x) (((x) & GENMASK(4, 3)) >> 3) +#define SYS_PORT_MODE_INCL_XTR_HDR(x) (((x) << 1) & GENMASK(2, 1)) +#define SYS_PORT_MODE_INCL_XTR_HDR_M GENMASK(2, 1) +#define SYS_PORT_MODE_INCL_XTR_HDR_X(x) (((x) & GENMASK(2, 1)) >> 1) +#define SYS_PORT_MODE_INJ_HDR_ERR BIT(0) + +#define SYS_FRONT_PORT_MODE_RSZ 0x4 + +#define SYS_FRONT_PORT_MODE_HDX_MODE BIT(0) + +#define SYS_FRM_AGING_AGE_TX_ENA BIT(20) +#define SYS_FRM_AGING_MAX_AGE(x) ((x) & GENMASK(19, 0)) +#define SYS_FRM_AGING_MAX_AGE_M GENMASK(19, 0) + +#define SYS_STAT_CFG_STAT_CLEAR_SHOT(x) (((x) << 10) & GENMASK(16, 10)) +#define SYS_STAT_CFG_STAT_CLEAR_SHOT_M GENMASK(16, 10) +#define SYS_STAT_CFG_STAT_CLEAR_SHOT_X(x) (((x) & GENMASK(16, 10)) >> 10) +#define SYS_STAT_CFG_STAT_VIEW(x) ((x) & GENMASK(9, 0)) +#define SYS_STAT_CFG_STAT_VIEW_M GENMASK(9, 0) + +#define SYS_SW_STATUS_RSZ 0x4 + +#define SYS_SW_STATUS_PORT_RX_PAUSED BIT(0) + +#define SYS_MISC_CFG_PTP_RSRV_CLR BIT(1) +#define SYS_MISC_CFG_PTP_DIS_NEG_RO BIT(0) + +#define SYS_REW_MAC_HIGH_CFG_RSZ 0x4 + +#define SYS_REW_MAC_LOW_CFG_RSZ 0x4 + +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG(x) (((x) << 6) & GENMASK(21, 6)) +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_M GENMASK(21, 6) +#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_X(x) (((x) & GENMASK(21, 6)) >> 6) +#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET(x) ((x) & GENMASK(5, 0)) +#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET_M GENMASK(5, 0) + +#define SYS_PAUSE_CFG_RSZ 0x4 + +#define SYS_PAUSE_CFG_PAUSE_START(x) (((x) << 10) & GENMASK(18, 10)) +#define SYS_PAUSE_CFG_PAUSE_START_M GENMASK(18, 10) +#define SYS_PAUSE_CFG_PAUSE_START_X(x) (((x) & GENMASK(18, 10)) >> 10) +#define SYS_PAUSE_CFG_PAUSE_STOP(x) (((x) << 1) & GENMASK(9, 1)) +#define SYS_PAUSE_CFG_PAUSE_STOP_M GENMASK(9, 1) +#define SYS_PAUSE_CFG_PAUSE_STOP_X(x) (((x) & GENMASK(9, 1)) >> 1) +#define SYS_PAUSE_CFG_PAUSE_ENA BIT(0) + +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START(x) (((x) << 9) & GENMASK(17, 9)) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_M GENMASK(17, 9) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_X(x) (((x) & GENMASK(17, 9)) >> 9) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP(x) ((x) & GENMASK(8, 0)) +#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP_M GENMASK(8, 0) + +#define SYS_ATOP_RSZ 0x4 + +#define SYS_MAC_FC_CFG_RSZ 0x4 + +#define SYS_MAC_FC_CFG_FC_LINK_SPEED(x) (((x) << 26) & GENMASK(27, 26)) +#define SYS_MAC_FC_CFG_FC_LINK_SPEED_M GENMASK(27, 26) +#define SYS_MAC_FC_CFG_FC_LINK_SPEED_X(x) (((x) & GENMASK(27, 26)) >> 26) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG(x) (((x) << 20) & GENMASK(25, 20)) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_M GENMASK(25, 20) +#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_X(x) (((x) & GENMASK(25, 20)) >> 20) +#define SYS_MAC_FC_CFG_ZERO_PAUSE_ENA BIT(18) +#define SYS_MAC_FC_CFG_TX_FC_ENA BIT(17) +#define SYS_MAC_FC_CFG_RX_FC_ENA BIT(16) +#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG(x) ((x) & GENMASK(15, 0)) +#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG_M GENMASK(15, 0) + +#define SYS_MMGT_RELCNT(x) (((x) << 16) & GENMASK(31, 16)) +#define SYS_MMGT_RELCNT_M GENMASK(31, 16) +#define SYS_MMGT_RELCNT_X(x) (((x) & GENMASK(31, 16)) >> 16) +#define SYS_MMGT_FREECNT(x) ((x) & GENMASK(15, 0)) +#define SYS_MMGT_FREECNT_M GENMASK(15, 0) + +#define SYS_MMGT_FAST_FREEVLD(x) (((x) << 4) & GENMASK(7, 4)) +#define SYS_MMGT_FAST_FREEVLD_M GENMASK(7, 4) +#define SYS_MMGT_FAST_FREEVLD_X(x) (((x) & GENMASK(7, 4)) >> 4) +#define SYS_MMGT_FAST_RELVLD(x) ((x) & GENMASK(3, 0)) +#define SYS_MMGT_FAST_RELVLD_M GENMASK(3, 0) + +#define SYS_EVENTS_DIF_RSZ 0x4 + +#define SYS_EVENTS_DIF_EV_DRX(x) (((x) << 6) & GENMASK(8, 6)) +#define SYS_EVENTS_DIF_EV_DRX_M GENMASK(8, 6) +#define SYS_EVENTS_DIF_EV_DRX_X(x) (((x) & GENMASK(8, 6)) >> 6) +#define SYS_EVENTS_DIF_EV_DTX(x) ((x) & GENMASK(5, 0)) +#define SYS_EVENTS_DIF_EV_DTX_M GENMASK(5, 0) + +#define SYS_EVENTS_CORE_EV_FWR BIT(2) +#define SYS_EVENTS_CORE_EV_ANA(x) ((x) & GENMASK(1, 0)) +#define SYS_EVENTS_CORE_EV_ANA_M GENMASK(1, 0) + +#define SYS_CNT_GSZ 0x4 + +#define SYS_PTP_STATUS_PTP_TXSTAMP_OAM BIT(29) +#define SYS_PTP_STATUS_PTP_OVFL BIT(28) +#define SYS_PTP_STATUS_PTP_MESS_VLD BIT(27) +#define SYS_PTP_STATUS_PTP_MESS_ID(x) (((x) << 21) & GENMASK(26, 21)) +#define SYS_PTP_STATUS_PTP_MESS_ID_M GENMASK(26, 21) +#define SYS_PTP_STATUS_PTP_MESS_ID_X(x) (((x) & GENMASK(26, 21)) >> 21) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT(x) (((x) << 16) & GENMASK(20, 16)) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT_M GENMASK(20, 16) +#define SYS_PTP_STATUS_PTP_MESS_TXPORT_X(x) (((x) & GENMASK(20, 16)) >> 16) +#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID(x) ((x) & GENMASK(15, 0)) +#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID_M GENMASK(15, 0) + +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP(x) ((x) & GENMASK(29, 0)) +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_M GENMASK(29, 0) +#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC BIT(31) + +#define SYS_PTP_NXT_PTP_NXT BIT(0) + +#define SYS_PTP_CFG_PTP_STAMP_WID(x) (((x) << 2) & GENMASK(7, 2)) +#define SYS_PTP_CFG_PTP_STAMP_WID_M GENMASK(7, 2) +#define SYS_PTP_CFG_PTP_STAMP_WID_X(x) (((x) & GENMASK(7, 2)) >> 2) +#define SYS_PTP_CFG_PTP_CF_ROLL_MODE(x) ((x) & GENMASK(1, 0)) +#define SYS_PTP_CFG_PTP_CF_ROLL_MODE_M GENMASK(1, 0) + +#define SYS_RAM_INIT_RAM_INIT BIT(1) +#define SYS_RAM_INIT_RAM_CFG_HOOK BIT(0) + +#endif -- cgit v1.2.3 From 8dce89aa5f3274e7c26132433840f63d129406bb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 14 Nov 2019 17:03:29 +0200 Subject: net: dsa: ocelot: add tagger for Ocelot/Felix switches While it is entirely possible that this tagger format is in fact more generic than just these 2 switch families, I don't have that knowledge. The Seville switch in NXP T1040 has a similar frame format, but there are enough differences (e.g. DEST field starts at bit 57 instead of 56) that calling this file tag_vitesse.c is a bit of a stretch at the moment. The frame format has been listed in a comment so that people who add support for further Vitesse switches can rework this tagger while keeping compatibility with Felix. The "ocelot" name was chosen instead of "felix" because even the Ocelot switch can act as a DSA device when it is used in NPI mode, and the Felix tagger format is almost identical. Currently it is only used for the Felix switch embedded in the NXP LS1028A chip. The ABI for this tagger should be considered "not stable" at the moment. The DSA tag is always placed before the Ethernet header and therefore, we are using the long prefix for RX tags to avoid putting the DSA master port in promiscuous mode. Once there will be an API in DSA for drivers to request DSA masters to be in promiscuous mode unconditionally, we will switch to the "no prefix" extraction frame header, which will save 16 padding bytes for each RX frame. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9507611a41f0..6767dc3f66c0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -42,6 +42,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_8021Q_VALUE 12 #define DSA_TAG_PROTO_SJA1105_VALUE 13 #define DSA_TAG_PROTO_KSZ8795_VALUE 14 +#define DSA_TAG_PROTO_OCELOT_VALUE 15 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -59,6 +60,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE, + DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE, }; struct packet_type; -- cgit v1.2.3 From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 15 Nov 2019 13:37:22 +0100 Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc Currently passing alignment greater than 4 to bpf_jit_binary_alloc does not work: in such cases it silently aligns only to 4 bytes. On s390, in order to load a constant from memory in a large (>512k) BPF program, one must use lgrl instruction, whose memory operand must be aligned on an 8-byte boundary. This patch makes it possible to request 8-byte alignment from bpf_jit_binary_alloc, and also makes it issue a warning when an unsupported alignment is requested. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com --- include/linux/filter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7a6f8f6f1da4..ad80e9c6111c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -515,10 +515,12 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +/* Some arches need doubleword alignment for their instructions and/or data */ +#define BPF_IMAGE_ALIGNMENT 8 + struct bpf_binary_header { u32 pages; - /* Some arches need word alignment for their instructions */ - u8 image[] __aligned(4); + u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog { -- cgit v1.2.3 From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:03 -0800 Subject: bpf: Add bpf_arch_text_poke() helper Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch nops/calls in kernel text into calls into BPF trampoline and to patch calls/nops inside BPF programs too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7c7f518811a6..8b90db25348a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, } #endif /* CONFIG_INET */ +enum bpf_text_poke_type { + BPF_MOD_NOP_TO_CALL, + BPF_MOD_CALL_TO_CALL, + BPF_MOD_CALL_TO_NOP, +}; +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *addr1, void *addr2); + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- include/linux/bpf.h | 105 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 2 + 2 files changed, 107 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b90db25348a..0d4c5c224d79 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -384,6 +386,100 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +struct btf_func_model { + u8 ret_size; + u8 nr_args; + u8 arg_size[MAX_BPF_FUNC_ARGS]; +}; + +/* Restore arguments before returning from trampoline to let original function + * continue executing. This flag is used for fentry progs when there are no + * fexit progs. + */ +#define BPF_TRAMP_F_RESTORE_REGS BIT(0) +/* Call original function after fentry progs, but before fexit progs. + * Makes sense for fentry/fexit, normal calls and indirect calls. + */ +#define BPF_TRAMP_F_CALL_ORIG BIT(1) +/* Skip current frame and return to parent. Makes sense for fentry/fexit + * programs only. Should not be used with normal calls and indirect calls. + */ +#define BPF_TRAMP_F_SKIP_FRAME BIT(2) + +/* Different use cases for BPF trampoline: + * 1. replace nop at the function entry (kprobe equivalent) + * flags = BPF_TRAMP_F_RESTORE_REGS + * fentry = a set of programs to run before returning from trampoline + * + * 2. replace nop at the function entry (kprobe + kretprobe equivalent) + * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME + * orig_call = fentry_ip + MCOUNT_INSN_SIZE + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + * + * 3. replace direct call instruction anywhere in the function body + * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid) + * With flags = 0 + * fentry = a set of programs to run before returning from trampoline + * With flags = BPF_TRAMP_F_CALL_ORIG + * orig_call = original callback addr or direct function addr + * fentry = a set of program to run before calling original function + * fexit = a set of program to run after original function + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call); +/* these two functions are called from generated trampoline */ +u64 notrace __bpf_prog_enter(void); +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); + +enum bpf_tramp_prog_type { + BPF_TRAMP_FENTRY, + BPF_TRAMP_FEXIT, + BPF_TRAMP_MAX +}; + +struct bpf_trampoline { + /* hlist for trampoline_table */ + struct hlist_node hlist; + /* serializes access to fields of this trampoline */ + struct mutex mutex; + refcount_t refcnt; + u64 key; + struct { + struct btf_func_model model; + void *addr; + } func; + /* list of BPF programs using this trampoline */ + struct hlist_head progs_hlist[BPF_TRAMP_MAX]; + /* Number of attached programs. A counter per kind. */ + int progs_cnt[BPF_TRAMP_MAX]; + /* Executable image of trampoline */ + void *image; + u64 selector; +}; +#ifdef CONFIG_BPF_JIT +struct bpf_trampoline *bpf_trampoline_lookup(u64 key); +int bpf_trampoline_link_prog(struct bpf_prog *prog); +int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +void bpf_trampoline_put(struct bpf_trampoline *tr); +#else +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +{ + return NULL; +} +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +{ + return -ENOTSUPP; +} +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} +#endif + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -398,6 +494,9 @@ struct bpf_prog_aux { bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + enum bpf_tramp_prog_type trampoline_prog_type; + struct bpf_trampoline *trampoline; + struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; /* function name for valid attach_btf_id */ @@ -784,6 +883,12 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_distill_func_proto(struct bpf_verifier_log *log, + struct btf *btf, + const struct btf_type *func_proto, + const char *func_name, + struct btf_func_model *m); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df6809a76404..69c200e6e696 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -201,6 +201,8 @@ enum bpf_attach_type { BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:14 -0800 Subject: bpf: Fix race in btf_resolve_helper_id() btf_resolve_helper_id() caching logic is a bit racy, since under root the verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE. Fix the type as well, since error is also recorded. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0d4c5c224d79..cb5a356381f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -248,7 +248,7 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - u32 *btf_id; /* BTF ids of arguments */ + int *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -881,7 +881,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From 5c27d8d76ce810c6254cf5917a6019d824f34bd2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 13 Nov 2019 14:08:01 +0100 Subject: netfilter: nf_flow_table_offload: add IPv6 support Add nf_flow_rule_route_ipv6() and use it from the IPv6 and the inet flowtable type definitions. Rename the nf_flow_rule_route() function to nf_flow_rule_route_ipv4(). Adjust maximum number of actions, which now becomes 16 to leave sufficient room for the IPv6 address mangling for NAT. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index eea66de328d3..f0897b3c97fb 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -163,9 +163,12 @@ void nf_flow_table_offload_flush(struct nf_flowtable *flowtable); int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd); -int nf_flow_rule_route(struct net *net, const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule); +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); +int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -- cgit v1.2.3 From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:15 -0800 Subject: bpf: Annotate context types Annotate BPF program context types with program-side type and kernel-side type. This type information is used by the verifier. btf_get_prog_ctx_type() is used in the later patches to verify that BTF type of ctx in BPF program matches to kernel expected ctx type. For example, the XDP program type is: BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) That means that XDP program should be written as: int xdp_prog(struct xdp_md *ctx) { ... } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org --- include/linux/bpf.h | 11 ++++++- include/linux/bpf_types.h | 78 +++++++++++++++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cb5a356381f5..9c48f11fe56e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -747,7 +747,7 @@ DECLARE_PER_CPU(int, bpf_prog_active); extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ @@ -1213,6 +1213,15 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif #ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index de14872b01ba..93740b3614d7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -2,42 +2,68 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, + struct xdp_md, struct xdp_buff) #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock, + struct bpf_sock, struct sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr, + struct bpf_sock_addr, struct bpf_sock_addr_kern) #endif -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) -BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops, + struct bpf_sock_ops, struct bpf_sock_ops_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg, + struct sk_msg_md, struct sk_msg) +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector, + struct __sk_buff, struct bpf_flow_dissector) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe, + bpf_user_pt_regs_t, struct pt_regs) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint, + __u64, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event, + struct bpf_perf_event_data, struct bpf_perf_event_data_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing, + void *, void *) #endif #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev, + struct bpf_cgroup_dev_ctx, struct bpf_cgroup_dev_ctx) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl, + struct bpf_sysctl, struct bpf_sysctl_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt, + struct bpf_sockopt, struct bpf_sockopt_kern) #endif #ifdef CONFIG_BPF_LIRC_MODE2 -BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, + __u32, u32) #endif #ifdef CONFIG_INET -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, + struct sk_reuseport_md, struct sk_reuseport_kern) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) -- cgit v1.2.3 From 8c1b6e69dcc1e11bd24111e3734dd740aaf3fda1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:16 -0800 Subject: bpf: Compare BTF types of functions arguments with actual types Make the verifier check that BTF types of function arguments match actual types passed into top-level BPF program and into BPF-to-BPF calls. If types match such BPF programs and sub-programs will have full support of BPF trampoline. If types mismatch the trampoline has to be conservative. It has to save/restore five program arguments and assume 64-bit scalars. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-17-ast@kernel.org --- include/linux/bpf.h | 8 ++++++++ include/linux/bpf_verifier.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c48f11fe56e..c70bf04726b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -480,6 +480,10 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #endif +struct bpf_func_info_aux { + bool unreliable; +}; + struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -494,6 +498,7 @@ struct bpf_prog_aux { bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool func_proto_unreliable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; @@ -518,6 +523,7 @@ struct bpf_prog_aux { struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; + struct bpf_func_info_aux *func_info_aux; /* bpf_line_info loaded from userspace. linfo->insn_off * has the xlated insn offset. * Both the main and sub prog share the same linfo. @@ -890,6 +896,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6e7284ea1468..cdd08bf0ec06 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -343,6 +343,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { + /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c70bf04726b4..5b81cde47314 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -495,6 +495,7 @@ struct bpf_prog_aux { u32 func_cnt; /* used by non-func prog as the number of func progs */ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 9dee00859c5f..79d4abc2556a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -88,6 +88,7 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); +struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69c200e6e696..4842a134b202 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -425,6 +425,7 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + __u32 attach_prog_fd; /* 0 to attach to vmlinux */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 ++- include/linux/pid_namespace.h | 2 ++ include/linux/sched/task.h | 3 +++ include/uapi/linux/sched.h | 53 ++++++++++++++++++++++++++++--------------- 4 files changed, 42 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 034e3cd60dc0..998ae7d24450 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr); extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 49538b172483..2ed6af88794b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -12,6 +12,8 @@ #include #include +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 struct fs_pin; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4b1c3b664f51..f1879884238e 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -26,6 +26,9 @@ struct kernel_clone_args { unsigned long stack; unsigned long stack_size; unsigned long tls; + pid_t *set_tid; + /* Number of elements in *set_tid */ + size_t set_tid_size; }; /* diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 1d500ed03c63..a0b1c224c72b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,24 +39,38 @@ #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. - * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. + * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -71,10 +85,13 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ /* * Scheduling policies -- cgit v1.2.3 From c3f812cea0d7006469d1cf33a4a9f0a12bb4b3a3 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 14 Nov 2019 14:13:00 -0800 Subject: page_pool: do not release pool until inflight == 0. The page pool keeps track of the number of pages in flight, and it isn't safe to remove the pool until all pages are returned. Disallow removing the pool until all pages are back, so the pool is always available for page producers. Make the page pool responsible for its own delayed destruction instead of relying on XDP, so the page pool can be used without the xdp memory model. When all pages are returned, free the pool and notify xdp if the pool is registered with the xdp memory system. Have the callback perform a table walk since some drivers (cpsw) may share the pool among multiple xdp_rxq_info. Note that the increment of pages_state_release_cnt may result in inflight == 0, resulting in the pool being released. Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning") Signed-off-by: Jonathan Lemon Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 52 +++++++++++++--------------------------------- include/net/xdp_priv.h | 4 ---- include/trace/events/xdp.h | 19 ++++------------- 3 files changed, 18 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 2cbcdbdec254..1121faa99c12 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -70,7 +70,12 @@ struct page_pool_params { struct page_pool { struct page_pool_params p; - u32 pages_state_hold_cnt; + struct delayed_work release_dw; + void (*disconnect)(void *); + unsigned long defer_start; + unsigned long defer_warn; + + u32 pages_state_hold_cnt; /* * Data structure for allocation side @@ -129,25 +134,19 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) struct page_pool *page_pool_create(const struct page_pool_params *params); -void __page_pool_free(struct page_pool *pool); -static inline void page_pool_free(struct page_pool *pool) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ #ifdef CONFIG_PAGE_POOL - __page_pool_free(pool); -#endif -} - -/* Drivers use this instead of page_pool_free */ +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +#else static inline void page_pool_destroy(struct page_pool *pool) { - if (!pool) - return; +} - page_pool_free(pool); +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *)) +{ } +#endif /* Never call this directly, use helpers below */ void __page_pool_put_page(struct page_pool *pool, @@ -170,24 +169,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } -/* API user MUST have disconnected alloc-side (not allowed to call - * page_pool_alloc_pages()) before calling this. The free-side can - * still run concurrently, to handle in-flight packet-pages. - * - * A request to shutdown can fail (with false) if there are still - * in-flight packet-pages. - */ -bool __page_pool_request_shutdown(struct page_pool *pool); -static inline bool page_pool_request_shutdown(struct page_pool *pool) -{ - bool safe_to_remove = false; - -#ifdef CONFIG_PAGE_POOL - safe_to_remove = __page_pool_request_shutdown(pool); -#endif - return safe_to_remove; -} - /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal @@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled_in(void) #endif } -static inline void page_pool_get(struct page_pool *pool) -{ - refcount_inc(&pool->user_cnt); -} - static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h index 6a8cba6ea79a..a9d5b7603b89 100644 --- a/include/net/xdp_priv.h +++ b/include/net/xdp_priv.h @@ -12,12 +12,8 @@ struct xdp_mem_allocator { struct page_pool *page_pool; struct zero_copy_allocator *zc_alloc; }; - int disconnect_cnt; - unsigned long defer_start; struct rhash_head node; struct rcu_head rcu; - struct delayed_work defer_wq; - unsigned long defer_warn; }; #endif /* __LINUX_NET_XDP_PRIV_H__ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index c7e3c9c5bad3..a7378bcd9928 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -317,19 +317,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, - TP_PROTO(const struct xdp_mem_allocator *xa, - bool safe_to_remove, bool force), + TP_PROTO(const struct xdp_mem_allocator *xa), - TP_ARGS(xa, safe_to_remove, force), + TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) - __field(bool, safe_to_remove) - __field(bool, force) - __field(int, disconnect_cnt) ), TP_fast_assign( @@ -337,19 +333,12 @@ TRACE_EVENT(mem_disconnect, __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; - __entry->safe_to_remove = safe_to_remove; - __entry->force = force; - __entry->disconnect_cnt = xa->disconnect_cnt; ), - TP_printk("mem_id=%d mem_type=%s allocator=%p" - " safe_to_remove=%s force=%s disconnect_cnt=%d", + TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), - __entry->allocator, - __entry->safe_to_remove ? "true" : "false", - __entry->force ? "true" : "false", - __entry->disconnect_cnt + __entry->allocator ) ); -- cgit v1.2.3 From 5fb8ef25803ef33e2eb60b626435828b937bed75 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:08 +0100 Subject: crypto: chacha - move existing library code into lib/crypto Currently, our generic ChaCha implementation consists of a permute function in lib/chacha.c that operates on the 64-byte ChaCha state directly [and which is always included into the core kernel since it is used by the /dev/random driver], and the crypto API plumbing to expose it as a skcipher. In order to support in-kernel users that need the ChaCha streamcipher but have no need [or tolerance] for going through the abstractions of the crypto API, let's expose the streamcipher bits via a library API as well, in a way that permits the implementation to be superseded by an architecture specific one if provided. So move the streamcipher code into a separate module in lib/crypto, and expose the init() and crypt() routines to users of the library. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/chacha.h | 77 +++++++++++++++++++++++++++++++--------- include/crypto/internal/chacha.h | 53 +++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 17 deletions(-) create mode 100644 include/crypto/internal/chacha.h (limited to 'include') diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index d1e723c6a37d..5c662f8fecac 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -15,9 +15,8 @@ #ifndef _CRYPTO_CHACHA_H #define _CRYPTO_CHACHA_H -#include +#include #include -#include /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA_IV_SIZE 16 @@ -29,26 +28,70 @@ /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 -struct chacha_ctx { - u32 key[8]; - int nrounds; -}; - -void chacha_block(u32 *state, u8 *stream, int nrounds); +void chacha_block_generic(u32 *state, u8 *stream, int nrounds); static inline void chacha20_block(u32 *state, u8 *stream) { - chacha_block(state, stream, 20); + chacha_block_generic(state, stream, 20); } -void hchacha_block(const u32 *in, u32 *out, int nrounds); -void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv); +void hchacha_block_arch(const u32 *state, u32 *out, int nrounds); +void hchacha_block_generic(const u32 *state, u32 *out, int nrounds); + +static inline void hchacha_block(const u32 *state, u32 *out, int nrounds) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) + hchacha_block_arch(state, out, nrounds); + else + hchacha_block_generic(state, out, nrounds); +} -int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize); -int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize); +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv); +static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv) +{ + state[0] = 0x61707865; /* "expa" */ + state[1] = 0x3320646e; /* "nd 3" */ + state[2] = 0x79622d32; /* "2-by" */ + state[3] = 0x6b206574; /* "te k" */ + state[4] = key[0]; + state[5] = key[1]; + state[6] = key[2]; + state[7] = key[3]; + state[8] = key[4]; + state[9] = key[5]; + state[10] = key[6]; + state[11] = key[7]; + state[12] = get_unaligned_le32(iv + 0); + state[13] = get_unaligned_le32(iv + 4); + state[14] = get_unaligned_le32(iv + 8); + state[15] = get_unaligned_le32(iv + 12); +} -int crypto_chacha_crypt(struct skcipher_request *req); -int crypto_xchacha_crypt(struct skcipher_request *req); +static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) + chacha_init_arch(state, key, iv); + else + chacha_init_generic(state, key, iv); +} + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); + +static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) + chacha_crypt_arch(state, dst, src, bytes, nrounds); + else + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + chacha_crypt(state, dst, src, bytes, 20); +} #endif /* _CRYPTO_CHACHA_H */ diff --git a/include/crypto/internal/chacha.h b/include/crypto/internal/chacha.h new file mode 100644 index 000000000000..c0e40b245431 --- /dev/null +++ b/include/crypto/internal/chacha.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CRYPTO_INTERNAL_CHACHA_H +#define _CRYPTO_INTERNAL_CHACHA_H + +#include +#include +#include + +struct chacha_ctx { + u32 key[8]; + int nrounds; +}; + +void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv); + +static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize, int nrounds) +{ + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + int i; + + if (keysize != CHACHA_KEY_SIZE) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(ctx->key); i++) + ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); + + ctx->nrounds = nrounds; + return 0; +} + +static inline int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize) +{ + return chacha_setkey(tfm, key, keysize, 20); +} + +static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize) +{ + return chacha_setkey(tfm, key, keysize, 12); +} + +int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize); +int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keysize); + +int crypto_chacha_crypt(struct skcipher_request *req); +int crypto_xchacha_crypt(struct skcipher_request *req); + +#endif /* _CRYPTO_CHACHA_H */ -- cgit v1.2.3 From 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:10 +0100 Subject: crypto: x86/chacha - expose SIMD ChaCha routine as library function Wire the existing x86 SIMD ChaCha code into the new ChaCha library interface, so that users of the library interface will get the accelerated version when available. Given that calls into the library API will always go through the routines in this module if it is enabled, switch to static keys to select the optimal implementation available (which may be none at all, in which case we defer to the generic implementation for all invocations). Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/chacha.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 5c662f8fecac..2676f4fbd4c1 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -25,6 +25,12 @@ #define CHACHA_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 +#ifdef CONFIG_X86_64 +#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32)) +#else +#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32)) +#endif + /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 -- cgit v1.2.3 From 22cf705360707ced15f9fe5423938f313c7df536 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:18 +0100 Subject: crypto: chacha - unexport chacha_generic routines Now that all users of generic ChaCha code have moved to the core library, there is no longer a need for the generic ChaCha skcpiher driver to export parts of it implementation for reuse by other drivers. So drop the exports, and make the symbols static. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/chacha.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/chacha.h b/include/crypto/internal/chacha.h index c0e40b245431..aa5d4a16aac5 100644 --- a/include/crypto/internal/chacha.h +++ b/include/crypto/internal/chacha.h @@ -12,8 +12,6 @@ struct chacha_ctx { int nrounds; }; -void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv); - static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize, int nrounds) { @@ -42,12 +40,4 @@ static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, return chacha_setkey(tfm, key, keysize, 12); } -int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize); -int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, - unsigned int keysize); - -int crypto_chacha_crypt(struct skcipher_request *req); -int crypto_xchacha_crypt(struct skcipher_request *req); - #endif /* _CRYPTO_CHACHA_H */ -- cgit v1.2.3 From 48ea8c6ebc96bc0990e12ee1c43d0832c23576bb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:19 +0100 Subject: crypto: poly1305 - move core routines into a separate library Move the core Poly1305 routines shared between the generic Poly1305 shash driver and the Adiantum and NHPoly1305 drivers into a separate library so that using just this pieces does not pull in the crypto API pieces of the generic Poly1305 routine. In a subsequent patch, we will augment this generic library with init/update/final routines so that Poyl1305 algorithm can be used directly without the need for using the crypto API's shash abstraction. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/poly1305.h | 67 ++++++++++++++++++++++++++++++++++++++ include/crypto/poly1305.h | 23 ------------- 2 files changed, 67 insertions(+), 23 deletions(-) create mode 100644 include/crypto/internal/poly1305.h (limited to 'include') diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h new file mode 100644 index 000000000000..cb58e61f73a7 --- /dev/null +++ b/include/crypto/internal/poly1305.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for the Poly1305 algorithm + */ + +#ifndef _CRYPTO_INTERNAL_POLY1305_H +#define _CRYPTO_INTERNAL_POLY1305_H + +#include +#include +#include + +struct shash_desc; + +/* + * Poly1305 core functions. These implement the ε-almost-∆-universal hash + * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce + * ("s key") at the end. They also only support block-aligned inputs. + */ +void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key); +static inline void poly1305_core_init(struct poly1305_state *state) +{ + *state = (struct poly1305_state){}; +} + +void poly1305_core_blocks(struct poly1305_state *state, + const struct poly1305_key *key, const void *src, + unsigned int nblocks, u32 hibit); +void poly1305_core_emit(const struct poly1305_state *state, void *dst); + +/* Crypto API helper functions for the Poly1305 MAC */ +int crypto_poly1305_init(struct shash_desc *desc); + +int crypto_poly1305_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen); +int crypto_poly1305_final(struct shash_desc *desc, u8 *dst); + +/* + * Poly1305 requires a unique key for each tag, which implies that we can't set + * it on the tfm that gets accessed by multiple users simultaneously. Instead we + * expect the key as the first 32 bytes in the update() call. + */ +static inline +unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + if (!dctx->sset) { + if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { + poly1305_core_setkey(&dctx->r, src); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->rset = true; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + dctx->s[0] = get_unaligned_le32(src + 0); + dctx->s[1] = get_unaligned_le32(src + 4); + dctx->s[2] = get_unaligned_le32(src + 8); + dctx->s[3] = get_unaligned_le32(src + 12); + src += POLY1305_BLOCK_SIZE; + srclen -= POLY1305_BLOCK_SIZE; + dctx->sset = true; + } + } + return srclen; +} + +#endif diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index 34317ed2071e..f5a4319c2a1f 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -38,27 +38,4 @@ struct poly1305_desc_ctx { bool sset; }; -/* - * Poly1305 core functions. These implement the ε-almost-∆-universal hash - * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce - * ("s key") at the end. They also only support block-aligned inputs. - */ -void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key); -static inline void poly1305_core_init(struct poly1305_state *state) -{ - memset(state->h, 0, sizeof(state->h)); -} -void poly1305_core_blocks(struct poly1305_state *state, - const struct poly1305_key *key, - const void *src, unsigned int nblocks); -void poly1305_core_emit(const struct poly1305_state *state, void *dst); - -/* Crypto API helper functions for the Poly1305 MAC */ -int crypto_poly1305_init(struct shash_desc *desc); -unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, - const u8 *src, unsigned int srclen); -int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen); -int crypto_poly1305_final(struct shash_desc *desc, u8 *dst); - #endif -- cgit v1.2.3 From ad8f5b88383ea685f2b8df2a12ee3e08089a1287 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:20 +0100 Subject: crypto: x86/poly1305 - unify Poly1305 state struct with generic code In preparation of exposing a Poly1305 library interface directly from the accelerated x86 driver, align the state descriptor of the x86 code with the one used by the generic driver. This is needed to make the library interface unified between all implementations. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/poly1305.h | 4 ++-- include/crypto/poly1305.h | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index cb58e61f73a7..04fa269e5534 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -46,10 +46,10 @@ unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx, { if (!dctx->sset) { if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) { - poly1305_core_setkey(&dctx->r, src); + poly1305_core_setkey(dctx->r, src); src += POLY1305_BLOCK_SIZE; srclen -= POLY1305_BLOCK_SIZE; - dctx->rset = true; + dctx->rset = 1; } if (srclen >= POLY1305_BLOCK_SIZE) { dctx->s[0] = get_unaligned_le32(src + 0); diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index f5a4319c2a1f..36b5886cb50c 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -22,20 +22,20 @@ struct poly1305_state { }; struct poly1305_desc_ctx { - /* key */ - struct poly1305_key r; - /* finalize key */ - u32 s[4]; - /* accumulator */ - struct poly1305_state h; /* partial buffer */ u8 buf[POLY1305_BLOCK_SIZE]; /* bytes used in partial buffer */ unsigned int buflen; - /* r key has been set */ - bool rset; - /* s key has been set */ + /* how many keys have been set in r[] */ + unsigned short rset; + /* whether s[] has been set */ bool sset; + /* finalize key */ + u32 s[4]; + /* accumulator */ + struct poly1305_state h; + /* key */ + struct poly1305_key r[1]; }; #endif -- cgit v1.2.3 From a1d93064094cc5e24d64e35cf093e7191d0c9344 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:21 +0100 Subject: crypto: poly1305 - expose init/update/final library interface Expose the existing generic Poly1305 code via a init/update/final library interface so that callers are not required to go through the crypto API's shash abstraction to access it. At the same time, make some preparations so that the library implementation can be superseded by an accelerated arch-specific version in the future. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/poly1305.h | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index 36b5886cb50c..74c6e1cd73ee 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -35,7 +35,43 @@ struct poly1305_desc_ctx { /* accumulator */ struct poly1305_state h; /* key */ - struct poly1305_key r[1]; + struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE]; }; +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key); +void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key); + +static inline void poly1305_init(struct poly1305_desc_ctx *desc, const u8 *key) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_init_arch(desc, key); + else + poly1305_init_generic(desc, key); +} + +void poly1305_update_arch(struct poly1305_desc_ctx *desc, const u8 *src, + unsigned int nbytes); +void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src, + unsigned int nbytes); + +static inline void poly1305_update(struct poly1305_desc_ctx *desc, + const u8 *src, unsigned int nbytes) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_update_arch(desc, src, nbytes); + else + poly1305_update_generic(desc, src, nbytes); +} + +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest); +void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *digest); + +static inline void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305)) + poly1305_final_arch(desc, digest); + else + poly1305_final_generic(desc, digest); +} + #endif -- cgit v1.2.3 From 1b2c6a5120489d41c8ea3b8dacd0b4586289b158 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:22 +0100 Subject: crypto: x86/poly1305 - depend on generic library not generic shash Remove the dependency on the generic Poly1305 driver. Instead, depend on the generic library so that we only reuse code without pulling in the generic skcipher implementation as well. While at it, remove the logic that prefers the non-SIMD path for short inputs - this is no longer necessary after recent FPU handling changes on x86. Since this removes the last remaining user of the routines exported by the generic shash driver, unexport them and make them static. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/poly1305.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/poly1305.h b/include/crypto/internal/poly1305.h index 04fa269e5534..479b0cab2a1a 100644 --- a/include/crypto/internal/poly1305.h +++ b/include/crypto/internal/poly1305.h @@ -10,8 +10,6 @@ #include #include -struct shash_desc; - /* * Poly1305 core functions. These implement the ε-almost-∆-universal hash * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce @@ -28,13 +26,6 @@ void poly1305_core_blocks(struct poly1305_state *state, unsigned int nblocks, u32 hibit); void poly1305_core_emit(const struct poly1305_state *state, void *dst); -/* Crypto API helper functions for the Poly1305 MAC */ -int crypto_poly1305_init(struct shash_desc *desc); - -int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen); -int crypto_poly1305_final(struct shash_desc *desc, u8 *dst); - /* * Poly1305 requires a unique key for each tag, which implies that we can't set * it on the tfm that gets accessed by multiple users simultaneously. Instead we -- cgit v1.2.3 From 66d7fb94e4ffe5acc589e0b2b4710aecc1f07a28 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 8 Nov 2019 13:22:28 +0100 Subject: crypto: blake2s - generic C library implementation and selftest The C implementation was originally based on Samuel Neves' public domain reference implementation but has since been heavily modified for the kernel. We're able to do compile-time optimizations by moving some scaffolding around the final function into the header file. Information: https://blake2.net/ Signed-off-by: Jason A. Donenfeld Signed-off-by: Samuel Neves Co-developed-by: Samuel Neves [ardb: - move from lib/zinc to lib/crypto - remove simd handling - rewrote selftest for better coverage - use fixed digest length for blake2s_hmac() and rename to blake2s256_hmac() ] Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/blake2s.h | 106 ++++++++++++++++++++++++++++++++++++++ include/crypto/internal/blake2s.h | 19 +++++++ 2 files changed, 125 insertions(+) create mode 100644 include/crypto/blake2s.h create mode 100644 include/crypto/internal/blake2s.h (limited to 'include') diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h new file mode 100644 index 000000000000..b471deac28ff --- /dev/null +++ b/include/crypto/blake2s.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#ifndef BLAKE2S_H +#define BLAKE2S_H + +#include +#include +#include + +#include + +enum blake2s_lengths { + BLAKE2S_BLOCK_SIZE = 64, + BLAKE2S_HASH_SIZE = 32, + BLAKE2S_KEY_SIZE = 32, + + BLAKE2S_128_HASH_SIZE = 16, + BLAKE2S_160_HASH_SIZE = 20, + BLAKE2S_224_HASH_SIZE = 28, + BLAKE2S_256_HASH_SIZE = 32, +}; + +struct blake2s_state { + u32 h[8]; + u32 t[2]; + u32 f[2]; + u8 buf[BLAKE2S_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + +enum blake2s_iv { + BLAKE2S_IV0 = 0x6A09E667UL, + BLAKE2S_IV1 = 0xBB67AE85UL, + BLAKE2S_IV2 = 0x3C6EF372UL, + BLAKE2S_IV3 = 0xA54FF53AUL, + BLAKE2S_IV4 = 0x510E527FUL, + BLAKE2S_IV5 = 0x9B05688CUL, + BLAKE2S_IV6 = 0x1F83D9ABUL, + BLAKE2S_IV7 = 0x5BE0CD19UL, +}; + +void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); +void blake2s_final(struct blake2s_state *state, u8 *out); + +static inline void blake2s_init_param(struct blake2s_state *state, + const u32 param) +{ + *state = (struct blake2s_state){{ + BLAKE2S_IV0 ^ param, + BLAKE2S_IV1, + BLAKE2S_IV2, + BLAKE2S_IV3, + BLAKE2S_IV4, + BLAKE2S_IV5, + BLAKE2S_IV6, + BLAKE2S_IV7, + }}; +} + +static inline void blake2s_init(struct blake2s_state *state, + const size_t outlen) +{ + blake2s_init_param(state, 0x01010000 | outlen); + state->outlen = outlen; +} + +static inline void blake2s_init_key(struct blake2s_state *state, + const size_t outlen, const void *key, + const size_t keylen) +{ + WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || + !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); + + blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen); + memcpy(state->buf, key, keylen); + state->buflen = BLAKE2S_BLOCK_SIZE; + state->outlen = outlen; +} + +static inline void blake2s(u8 *out, const u8 *in, const u8 *key, + const size_t outlen, const size_t inlen, + const size_t keylen) +{ + struct blake2s_state state; + + WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || + outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || + (!key && keylen))); + + if (keylen) + blake2s_init_key(&state, outlen, key, keylen); + else + blake2s_init(&state, outlen); + + blake2s_update(&state, in, inlen); + blake2s_final(&state, out); +} + +void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, + const size_t keylen); + +#endif /* BLAKE2S_H */ diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h new file mode 100644 index 000000000000..941693effc7d --- /dev/null +++ b/include/crypto/internal/blake2s.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ + +#ifndef BLAKE2S_INTERNAL_H +#define BLAKE2S_INTERNAL_H + +#include + +void blake2s_compress_generic(struct blake2s_state *state,const u8 *block, + size_t nblocks, const u32 inc); + +void blake2s_compress_arch(struct blake2s_state *state,const u8 *block, + size_t nblocks, const u32 inc); + +static inline void blake2s_set_lastblock(struct blake2s_state *state) +{ + state->f[0] = -1; +} + +#endif /* BLAKE2S_INTERNAL_H */ -- cgit v1.2.3 From 7f9b0880925f1f9d7d59504ea0892d2ae9cfc233 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:30 +0100 Subject: crypto: blake2s - implement generic shash driver Wire up our newly added Blake2s implementation via the shash API. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/blake2s.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h index 941693effc7d..74ff77032e52 100644 --- a/include/crypto/internal/blake2s.h +++ b/include/crypto/internal/blake2s.h @@ -5,6 +5,11 @@ #include +struct blake2s_tfm_ctx { + u8 key[BLAKE2S_KEY_SIZE]; + unsigned int keylen; +}; + void blake2s_compress_generic(struct blake2s_state *state,const u8 *block, size_t nblocks, const u32 inc); -- cgit v1.2.3 From 0ed42a6f431e930b2e8fae21955406e09fe75d70 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 8 Nov 2019 13:22:32 +0100 Subject: crypto: curve25519 - generic C library implementations This contains two formally verified C implementations of the Curve25519 scalar multiplication function, one for 32-bit systems, and one for 64-bit systems whose compiler supports efficient 128-bit integer types. Not only are these implementations formally verified, but they are also the fastest available C implementations. They have been modified to be friendly to kernel space and to be generally less horrendous looking, but still an effort has been made to retain their formally verified characteristic, and so the C might look slightly unidiomatic. The 64-bit version comes from HACL*: https://github.com/project-everest/hacl-star The 32-bit version comes from Fiat: https://github.com/mit-plv/fiat-crypto Information: https://cr.yp.to/ecdh.html Signed-off-by: Jason A. Donenfeld [ardb: - move from lib/zinc to lib/crypto - replace .c #includes with Kconfig based object selection - drop simd handling and simplify support for per-arch versions ] Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/curve25519.h | 71 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 include/crypto/curve25519.h (limited to 'include') diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h new file mode 100644 index 000000000000..4e6dc840b159 --- /dev/null +++ b/include/crypto/curve25519.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#ifndef CURVE25519_H +#define CURVE25519_H + +#include // For crypto_memneq. +#include +#include + +enum curve25519_lengths { + CURVE25519_KEY_SIZE = 32 +}; + +extern const u8 curve25519_null_point[]; +extern const u8 curve25519_base_point[]; + +void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]); + +void curve25519_arch(u8 out[CURVE25519_KEY_SIZE], + const u8 scalar[CURVE25519_KEY_SIZE], + const u8 point[CURVE25519_KEY_SIZE]); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]); + +static inline +bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_arch(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); + return crypto_memneq(mypublic, curve25519_null_point, + CURVE25519_KEY_SIZE); +} + +static inline bool +__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (unlikely(!crypto_memneq(secret, curve25519_null_point, + CURVE25519_KEY_SIZE))) + return false; + + if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519)) + curve25519_base_arch(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); + return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE); +} + +static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) +{ + secret[0] &= 248; + secret[31] = (secret[31] & 127) | 64; +} + +static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE]) +{ + get_random_bytes_wait(secret, CURVE25519_KEY_SIZE); + curve25519_clamp_secret(secret); +} + +#endif /* CURVE25519_H */ -- cgit v1.2.3 From ed20078b7e3331e82828be357147af6a3282e4ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:39 +0100 Subject: crypto: chacha20poly1305 - import construction and selftest from Zinc This incorporates the chacha20poly1305 from the Zinc library, retaining the library interface, but replacing the implementation with calls into the code that already existed in the kernel's crypto API. Note that this library API does not implement RFC7539 fully, given that it is limited to 64-bit nonces. (The 96-bit nonce version that was part of the selftest only has been removed, along with the 96-bit nonce test vectors that only tested the selftest but not the actual library itself) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/chacha20poly1305.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/crypto/chacha20poly1305.h (limited to 'include') diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h new file mode 100644 index 000000000000..ad3b1de58df8 --- /dev/null +++ b/include/crypto/chacha20poly1305.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#ifndef __CHACHA20POLY1305_H +#define __CHACHA20POLY1305_H + +#include + +enum chacha20poly1305_lengths { + XCHACHA20POLY1305_NONCE_SIZE = 24, + CHACHA20POLY1305_KEY_SIZE = 32, + CHACHA20POLY1305_AUTHTAG_SIZE = 16 +}; + +void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + +bool __must_check +chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, const u64 nonce, + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + +void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + +bool __must_check xchacha20poly1305_decrypt( + u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, + const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + +#endif /* __CHACHA20POLY1305_H */ -- cgit v1.2.3 From d95312a3ccc0cd544d374be2fc45aeaa803e5fd9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Nov 2019 13:22:40 +0100 Subject: crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine Reimplement the library routines to perform chacha20poly1305 en/decryption on scatterlists, without [ab]using the [deprecated] blkcipher interface, which is rather heavyweight and does things we don't really need. Instead, we use the sg_miter API in a novel and clever way, to iterate over the scatterlist in-place (i.e., source == destination, which is the only way this library is expected to be used). That way, we don't have to iterate over two scatterlists in parallel. Another optimization is that, instead of relying on the blkcipher walker to present the input in suitable chunks, we recognize that ChaCha is a streamcipher, and so we can simply deal with partial blocks by keeping a block of cipherstream on the stack and use crypto_xor() to mix it with the in/output. Finally, we omit the scatterwalk_and_copy() call if the last element of the scatterlist covers the MAC as well (which is the common case), avoiding the need to walk the scatterlist and kmap() the page twice. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/chacha20poly1305.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h index ad3b1de58df8..234ee28078ef 100644 --- a/include/crypto/chacha20poly1305.h +++ b/include/crypto/chacha20poly1305.h @@ -7,6 +7,7 @@ #define __CHACHA20POLY1305_H #include +#include enum chacha20poly1305_lengths { XCHACHA20POLY1305_NONCE_SIZE = 24, @@ -34,4 +35,14 @@ bool __must_check xchacha20poly1305_decrypt( const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], const u8 key[CHACHA20POLY1305_KEY_SIZE]); +bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + +bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len, + const u8 *ad, const size_t ad_len, + const u64 nonce, + const u8 key[CHACHA20POLY1305_KEY_SIZE]); + #endif /* __CHACHA20POLY1305_H */ -- cgit v1.2.3 From d63007eb954e425f45766482813738f41db9af45 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 9 Nov 2019 18:09:53 +0100 Subject: crypto: ablkcipher - remove deprecated and unused ablkcipher support Now that all users of the deprecated ablkcipher interface have been moved to the skcipher interface, ablkcipher is no longer used and can be removed. Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/aead.h | 2 +- include/crypto/algapi.h | 75 ------- include/crypto/engine.h | 4 - include/crypto/hash.h | 2 +- include/crypto/internal/des.h | 12 - include/crypto/internal/skcipher.h | 20 -- include/crypto/skcipher.h | 6 - include/linux/crypto.h | 435 ------------------------------------- 8 files changed, 2 insertions(+), 554 deletions(-) (limited to 'include') diff --git a/include/crypto/aead.h b/include/crypto/aead.h index 3c245b1859e7..a3bdadf6221e 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -321,7 +321,7 @@ int crypto_aead_encrypt(struct aead_request *req); /** * crypto_aead_decrypt() - decrypt ciphertext - * @req: reference to the ablkcipher_request handle that holds all information + * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the aead_request handle. That data structure diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index cadc5257c612..5cd846defdd6 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -85,25 +85,6 @@ struct scatter_walk { unsigned int offset; }; -struct ablkcipher_walk { - struct { - struct page *page; - unsigned int offset; - } src, dst; - - struct scatter_walk in; - unsigned int nbytes; - struct scatter_walk out; - unsigned int total; - struct list_head buffers; - u8 *iv_buffer; - u8 *iv; - int flags; - unsigned int blocksize; -}; - -extern const struct crypto_type crypto_ablkcipher_type; - void crypto_mod_put(struct crypto_alg *alg); int crypto_register_template(struct crypto_template *tmpl); @@ -202,12 +183,6 @@ static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2, } } -int ablkcipher_walk_done(struct ablkcipher_request *req, - struct ablkcipher_walk *walk, int err); -int ablkcipher_walk_phys(struct ablkcipher_request *req, - struct ablkcipher_walk *walk); -void __ablkcipher_walk_complete(struct ablkcipher_walk *walk); - static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm) { return PTR_ALIGN(crypto_tfm_ctx(tfm), @@ -225,22 +200,6 @@ static inline void *crypto_instance_ctx(struct crypto_instance *inst) return inst->__ctx; } -static inline struct ablkcipher_alg *crypto_ablkcipher_alg( - struct crypto_ablkcipher *tfm) -{ - return &crypto_ablkcipher_tfm(tfm)->__crt_alg->cra_ablkcipher; -} - -static inline void *crypto_ablkcipher_ctx(struct crypto_ablkcipher *tfm) -{ - return crypto_tfm_ctx(&tfm->base); -} - -static inline void *crypto_ablkcipher_ctx_aligned(struct crypto_ablkcipher *tfm) -{ - return crypto_tfm_ctx_aligned(&tfm->base); -} - static inline struct crypto_cipher *crypto_spawn_cipher( struct crypto_spawn *spawn) { @@ -255,23 +214,6 @@ static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm) return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher; } -static inline void ablkcipher_walk_init(struct ablkcipher_walk *walk, - struct scatterlist *dst, - struct scatterlist *src, - unsigned int nbytes) -{ - walk->in.sg = src; - walk->out.sg = dst; - walk->total = nbytes; - INIT_LIST_HEAD(&walk->buffers); -} - -static inline void ablkcipher_walk_complete(struct ablkcipher_walk *walk) -{ - if (unlikely(!list_empty(&walk->buffers))) - __ablkcipher_walk_complete(walk); -} - static inline struct crypto_async_request *crypto_get_backlog( struct crypto_queue *queue) { @@ -279,23 +221,6 @@ static inline struct crypto_async_request *crypto_get_backlog( container_of(queue->backlog, struct crypto_async_request, list); } -static inline int ablkcipher_enqueue_request(struct crypto_queue *queue, - struct ablkcipher_request *request) -{ - return crypto_enqueue_request(queue, &request->base); -} - -static inline struct ablkcipher_request *ablkcipher_dequeue_request( - struct crypto_queue *queue) -{ - return ablkcipher_request_cast(crypto_dequeue_request(queue)); -} - -static inline void *ablkcipher_request_ctx(struct ablkcipher_request *req) -{ - return req->__ctx; -} - static inline struct crypto_alg *crypto_get_attr_alg(struct rtattr **tb, u32 type, u32 mask) { diff --git a/include/crypto/engine.h b/include/crypto/engine.h index 84c708bba00b..e29cd67f93c7 100644 --- a/include/crypto/engine.h +++ b/include/crypto/engine.h @@ -83,8 +83,6 @@ struct crypto_engine_ctx { struct crypto_engine_op op; }; -int crypto_transfer_ablkcipher_request_to_engine(struct crypto_engine *engine, - struct ablkcipher_request *req); int crypto_transfer_aead_request_to_engine(struct crypto_engine *engine, struct aead_request *req); int crypto_transfer_akcipher_request_to_engine(struct crypto_engine *engine, @@ -93,8 +91,6 @@ int crypto_transfer_hash_request_to_engine(struct crypto_engine *engine, struct ahash_request *req); int crypto_transfer_skcipher_request_to_engine(struct crypto_engine *engine, struct skcipher_request *req); -void crypto_finalize_ablkcipher_request(struct crypto_engine *engine, - struct ablkcipher_request *req, int err); void crypto_finalize_aead_request(struct crypto_engine *engine, struct aead_request *req, int err); void crypto_finalize_akcipher_request(struct crypto_engine *engine, diff --git a/include/crypto/hash.h b/include/crypto/hash.h index d52b95b75ae4..fe7f73bad1e2 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -227,7 +227,7 @@ struct crypto_shash { * CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto) * * The asynchronous cipher operation discussion provided for the - * CRYPTO_ALG_TYPE_ABLKCIPHER API applies here as well. + * CRYPTO_ALG_TYPE_SKCIPHER API applies here as well. */ static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) diff --git a/include/crypto/internal/des.h b/include/crypto/internal/des.h index 81ea1a425e9c..f62a2bb1866b 100644 --- a/include/crypto/internal/des.h +++ b/include/crypto/internal/des.h @@ -117,18 +117,6 @@ static inline int verify_skcipher_des3_key(struct crypto_skcipher *tfm, return crypto_des3_ede_verify_key(crypto_skcipher_tfm(tfm), key); } -static inline int verify_ablkcipher_des_key(struct crypto_ablkcipher *tfm, - const u8 *key) -{ - return crypto_des_verify_key(crypto_ablkcipher_tfm(tfm), key); -} - -static inline int verify_ablkcipher_des3_key(struct crypto_ablkcipher *tfm, - const u8 *key) -{ - return crypto_des3_ede_verify_key(crypto_ablkcipher_tfm(tfm), key); -} - static inline int verify_aead_des_key(struct crypto_aead *tfm, const u8 *key, int keylen) { diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 454e898d5f5f..921c409fe1b1 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -153,17 +153,6 @@ static inline void skcipher_walk_abort(struct skcipher_walk *walk) skcipher_walk_done(walk, -ECANCELED); } -static inline void ablkcipher_request_complete(struct ablkcipher_request *req, - int err) -{ - req->base.complete(&req->base, err); -} - -static inline u32 ablkcipher_request_flags(struct ablkcipher_request *req) -{ - return req->base.flags; -} - static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm) { return crypto_tfm_ctx(&tfm->base); @@ -182,27 +171,18 @@ static inline u32 skcipher_request_flags(struct skcipher_request *req) static inline unsigned int crypto_skcipher_alg_min_keysize( struct skcipher_alg *alg) { - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_ablkcipher.min_keysize; - return alg->min_keysize; } static inline unsigned int crypto_skcipher_alg_max_keysize( struct skcipher_alg *alg) { - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_ablkcipher.max_keysize; - return alg->max_keysize; } static inline unsigned int crypto_skcipher_alg_walksize( struct skcipher_alg *alg) { - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_blocksize; - return alg->walksize; } diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 8c5a31e810da..b4655d91661f 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -241,9 +241,6 @@ static inline struct skcipher_alg *crypto_skcipher_alg( static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg) { - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_ablkcipher.ivsize; - return alg->ivsize; } @@ -286,9 +283,6 @@ static inline unsigned int crypto_skcipher_blocksize( static inline unsigned int crypto_skcipher_alg_chunksize( struct skcipher_alg *alg) { - if (alg->base.cra_ablkcipher.encrypt) - return alg->base.cra_blocksize; - return alg->chunksize; } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e9f2c6b5d800..23365a9d062e 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -41,7 +41,6 @@ #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 #define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 #define CRYPTO_ALG_TYPE_AEAD 0x00000003 -#define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_KPP 0x00000008 #define CRYPTO_ALG_TYPE_ACOMPRESS 0x0000000a @@ -137,7 +136,6 @@ #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN))) struct scatterlist; -struct crypto_ablkcipher; struct crypto_async_request; struct crypto_tfm; struct crypto_type; @@ -160,19 +158,6 @@ struct crypto_async_request { u32 flags; }; -struct ablkcipher_request { - struct crypto_async_request base; - - unsigned int nbytes; - - void *info; - - struct scatterlist *src; - struct scatterlist *dst; - - void *__ctx[] CRYPTO_MINALIGN_ATTR; -}; - /** * DOC: Block Cipher Algorithm Definitions * @@ -180,57 +165,6 @@ struct ablkcipher_request { * managed via crypto_register_alg() and crypto_unregister_alg(). */ -/** - * struct ablkcipher_alg - asynchronous block cipher definition - * @min_keysize: Minimum key size supported by the transformation. This is the - * smallest key length supported by this transformation algorithm. - * This must be set to one of the pre-defined values as this is - * not hardware specific. Possible values for this field can be - * found via git grep "_MIN_KEY_SIZE" include/crypto/ - * @max_keysize: Maximum key size supported by the transformation. This is the - * largest key length supported by this transformation algorithm. - * This must be set to one of the pre-defined values as this is - * not hardware specific. Possible values for this field can be - * found via git grep "_MAX_KEY_SIZE" include/crypto/ - * @setkey: Set key for the transformation. This function is used to either - * program a supplied key into the hardware or store the key in the - * transformation context for programming it later. Note that this - * function does modify the transformation context. This function can - * be called multiple times during the existence of the transformation - * object, so one must make sure the key is properly reprogrammed into - * the hardware. This function is also responsible for checking the key - * length for validity. In case a software fallback was put in place in - * the @cra_init call, this function might need to use the fallback if - * the algorithm doesn't support all of the key sizes. - * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt - * the supplied scatterlist containing the blocks of data. The crypto - * API consumer is responsible for aligning the entries of the - * scatterlist properly and making sure the chunks are correctly - * sized. In case a software fallback was put in place in the - * @cra_init call, this function might need to use the fallback if - * the algorithm doesn't support all of the key sizes. In case the - * key was stored in transformation context, the key might need to be - * re-programmed into the hardware in this function. This function - * shall not modify the transformation context, as this function may - * be called in parallel with the same transformation object. - * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt - * and the conditions are exactly the same. - * @ivsize: IV size applicable for transformation. The consumer must provide an - * IV of exactly that size to perform the encrypt or decrypt operation. - * - * All fields except @ivsize are mandatory and must be filled. - */ -struct ablkcipher_alg { - int (*setkey)(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int keylen); - int (*encrypt)(struct ablkcipher_request *req); - int (*decrypt)(struct ablkcipher_request *req); - - unsigned int min_keysize; - unsigned int max_keysize; - unsigned int ivsize; -}; - /** * struct cipher_alg - single-block symmetric ciphers definition * @cia_min_keysize: Minimum key size supported by the transformation. This is @@ -415,7 +349,6 @@ struct crypto_istat_rng { }; #endif /* CONFIG_CRYPTO_STATS */ -#define cra_ablkcipher cra_u.ablkcipher #define cra_cipher cra_u.cipher #define cra_compress cra_u.compress @@ -483,8 +416,6 @@ struct crypto_istat_rng { * @cra_exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @cra_init, used to remove various changes set in * @cra_init. - * @cra_u.ablkcipher: Union member which contains an asynchronous block cipher - * definition. See @struct @ablkcipher_alg. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. * @cra_u.compress: Union member which contains a (de)compression algorithm. @@ -526,7 +457,6 @@ struct crypto_alg { const struct crypto_type *cra_type; union { - struct ablkcipher_alg ablkcipher; struct cipher_alg cipher; struct compress_alg compress; } cra_u; @@ -554,8 +484,6 @@ struct crypto_alg { #ifdef CONFIG_CRYPTO_STATS void crypto_stats_init(struct crypto_alg *alg); void crypto_stats_get(struct crypto_alg *alg); -void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); -void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg); void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg); @@ -578,10 +506,6 @@ static inline void crypto_stats_init(struct crypto_alg *alg) {} static inline void crypto_stats_get(struct crypto_alg *alg) {} -static inline void crypto_stats_ablkcipher_encrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) -{} -static inline void crypto_stats_ablkcipher_decrypt(unsigned int nbytes, int ret, struct crypto_alg *alg) -{} static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) {} static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) @@ -675,18 +599,6 @@ int crypto_has_alg(const char *name, u32 type, u32 mask); * crypto_free_*(), as well as the various helpers below. */ -struct ablkcipher_tfm { - int (*setkey)(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int keylen); - int (*encrypt)(struct ablkcipher_request *req); - int (*decrypt)(struct ablkcipher_request *req); - - struct crypto_ablkcipher *base; - - unsigned int ivsize; - unsigned int reqsize; -}; - struct cipher_tfm { int (*cit_setkey)(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); @@ -703,7 +615,6 @@ struct compress_tfm { u8 *dst, unsigned int *dlen); }; -#define crt_ablkcipher crt_u.ablkcipher #define crt_cipher crt_u.cipher #define crt_compress crt_u.compress @@ -712,7 +623,6 @@ struct crypto_tfm { u32 crt_flags; union { - struct ablkcipher_tfm ablkcipher; struct cipher_tfm cipher; struct compress_tfm compress; } crt_u; @@ -724,10 +634,6 @@ struct crypto_tfm { void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; }; -struct crypto_ablkcipher { - struct crypto_tfm base; -}; - struct crypto_cipher { struct crypto_tfm base; }; @@ -835,347 +741,6 @@ static inline unsigned int crypto_tfm_ctx_alignment(void) return __alignof__(tfm->__crt_ctx); } -/* - * API wrappers. - */ -static inline struct crypto_ablkcipher *__crypto_ablkcipher_cast( - struct crypto_tfm *tfm) -{ - return (struct crypto_ablkcipher *)tfm; -} - -/** - * DOC: Asynchronous Block Cipher API - * - * Asynchronous block cipher API is used with the ciphers of type - * CRYPTO_ALG_TYPE_ABLKCIPHER (listed as type "ablkcipher" in /proc/crypto). - * - * Asynchronous cipher operations imply that the function invocation for a - * cipher request returns immediately before the completion of the operation. - * The cipher request is scheduled as a separate kernel thread and therefore - * load-balanced on the different CPUs via the process scheduler. To allow - * the kernel crypto API to inform the caller about the completion of a cipher - * request, the caller must provide a callback function. That function is - * invoked with the cipher handle when the request completes. - * - * To support the asynchronous operation, additional information than just the - * cipher handle must be supplied to the kernel crypto API. That additional - * information is given by filling in the ablkcipher_request data structure. - * - * For the asynchronous block cipher API, the state is maintained with the tfm - * cipher handle. A single tfm can be used across multiple calls and in - * parallel. For asynchronous block cipher calls, context data supplied and - * only used by the caller can be referenced the request data structure in - * addition to the IV used for the cipher request. The maintenance of such - * state information would be important for a crypto driver implementer to - * have, because when calling the callback function upon completion of the - * cipher operation, that callback function may need some information about - * which operation just finished if it invoked multiple in parallel. This - * state information is unused by the kernel crypto API. - */ - -static inline struct crypto_tfm *crypto_ablkcipher_tfm( - struct crypto_ablkcipher *tfm) -{ - return &tfm->base; -} - -/** - * crypto_free_ablkcipher() - zeroize and free cipher handle - * @tfm: cipher handle to be freed - */ -static inline void crypto_free_ablkcipher(struct crypto_ablkcipher *tfm) -{ - crypto_free_tfm(crypto_ablkcipher_tfm(tfm)); -} - -static inline struct ablkcipher_tfm *crypto_ablkcipher_crt( - struct crypto_ablkcipher *tfm) -{ - return &crypto_ablkcipher_tfm(tfm)->crt_ablkcipher; -} - -/** - * crypto_ablkcipher_ivsize() - obtain IV size - * @tfm: cipher handle - * - * The size of the IV for the ablkcipher referenced by the cipher handle is - * returned. This IV size may be zero if the cipher does not need an IV. - * - * Return: IV size in bytes - */ -static inline unsigned int crypto_ablkcipher_ivsize( - struct crypto_ablkcipher *tfm) -{ - return crypto_ablkcipher_crt(tfm)->ivsize; -} - -/** - * crypto_ablkcipher_blocksize() - obtain block size of cipher - * @tfm: cipher handle - * - * The block size for the ablkcipher referenced with the cipher handle is - * returned. The caller may use that information to allocate appropriate - * memory for the data returned by the encryption or decryption operation - * - * Return: block size of cipher - */ -static inline unsigned int crypto_ablkcipher_blocksize( - struct crypto_ablkcipher *tfm) -{ - return crypto_tfm_alg_blocksize(crypto_ablkcipher_tfm(tfm)); -} - -static inline unsigned int crypto_ablkcipher_alignmask( - struct crypto_ablkcipher *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_ablkcipher_tfm(tfm)); -} - -static inline u32 crypto_ablkcipher_get_flags(struct crypto_ablkcipher *tfm) -{ - return crypto_tfm_get_flags(crypto_ablkcipher_tfm(tfm)); -} - -static inline void crypto_ablkcipher_set_flags(struct crypto_ablkcipher *tfm, - u32 flags) -{ - crypto_tfm_set_flags(crypto_ablkcipher_tfm(tfm), flags); -} - -static inline void crypto_ablkcipher_clear_flags(struct crypto_ablkcipher *tfm, - u32 flags) -{ - crypto_tfm_clear_flags(crypto_ablkcipher_tfm(tfm), flags); -} - -/** - * crypto_ablkcipher_setkey() - set key for cipher - * @tfm: cipher handle - * @key: buffer holding the key - * @keylen: length of the key in bytes - * - * The caller provided key is set for the ablkcipher referenced by the cipher - * handle. - * - * Note, the key length determines the cipher type. Many block ciphers implement - * different cipher modes depending on the key size, such as AES-128 vs AES-192 - * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 - * is performed. - * - * Return: 0 if the setting of the key was successful; < 0 if an error occurred - */ -static inline int crypto_ablkcipher_setkey(struct crypto_ablkcipher *tfm, - const u8 *key, unsigned int keylen) -{ - struct ablkcipher_tfm *crt = crypto_ablkcipher_crt(tfm); - - return crt->setkey(crt->base, key, keylen); -} - -/** - * crypto_ablkcipher_reqtfm() - obtain cipher handle from request - * @req: ablkcipher_request out of which the cipher handle is to be obtained - * - * Return the crypto_ablkcipher handle when furnishing an ablkcipher_request - * data structure. - * - * Return: crypto_ablkcipher handle - */ -static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm( - struct ablkcipher_request *req) -{ - return __crypto_ablkcipher_cast(req->base.tfm); -} - -/** - * crypto_ablkcipher_encrypt() - encrypt plaintext - * @req: reference to the ablkcipher_request handle that holds all information - * needed to perform the cipher operation - * - * Encrypt plaintext data using the ablkcipher_request handle. That data - * structure and how it is filled with data is discussed with the - * ablkcipher_request_* functions. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_ablkcipher_encrypt(struct ablkcipher_request *req) -{ - struct ablkcipher_tfm *crt = - crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); - struct crypto_alg *alg = crt->base->base.__crt_alg; - unsigned int nbytes = req->nbytes; - int ret; - - crypto_stats_get(alg); - ret = crt->encrypt(req); - crypto_stats_ablkcipher_encrypt(nbytes, ret, alg); - return ret; -} - -/** - * crypto_ablkcipher_decrypt() - decrypt ciphertext - * @req: reference to the ablkcipher_request handle that holds all information - * needed to perform the cipher operation - * - * Decrypt ciphertext data using the ablkcipher_request handle. That data - * structure and how it is filled with data is discussed with the - * ablkcipher_request_* functions. - * - * Return: 0 if the cipher operation was successful; < 0 if an error occurred - */ -static inline int crypto_ablkcipher_decrypt(struct ablkcipher_request *req) -{ - struct ablkcipher_tfm *crt = - crypto_ablkcipher_crt(crypto_ablkcipher_reqtfm(req)); - struct crypto_alg *alg = crt->base->base.__crt_alg; - unsigned int nbytes = req->nbytes; - int ret; - - crypto_stats_get(alg); - ret = crt->decrypt(req); - crypto_stats_ablkcipher_decrypt(nbytes, ret, alg); - return ret; -} - -/** - * DOC: Asynchronous Cipher Request Handle - * - * The ablkcipher_request data structure contains all pointers to data - * required for the asynchronous cipher operation. This includes the cipher - * handle (which can be used by multiple ablkcipher_request instances), pointer - * to plaintext and ciphertext, asynchronous callback function, etc. It acts - * as a handle to the ablkcipher_request_* API calls in a similar way as - * ablkcipher handle to the crypto_ablkcipher_* API calls. - */ - -/** - * crypto_ablkcipher_reqsize() - obtain size of the request data structure - * @tfm: cipher handle - * - * Return: number of bytes - */ -static inline unsigned int crypto_ablkcipher_reqsize( - struct crypto_ablkcipher *tfm) -{ - return crypto_ablkcipher_crt(tfm)->reqsize; -} - -/** - * ablkcipher_request_set_tfm() - update cipher handle reference in request - * @req: request handle to be modified - * @tfm: cipher handle that shall be added to the request handle - * - * Allow the caller to replace the existing ablkcipher handle in the request - * data structure with a different one. - */ -static inline void ablkcipher_request_set_tfm( - struct ablkcipher_request *req, struct crypto_ablkcipher *tfm) -{ - req->base.tfm = crypto_ablkcipher_tfm(crypto_ablkcipher_crt(tfm)->base); -} - -static inline struct ablkcipher_request *ablkcipher_request_cast( - struct crypto_async_request *req) -{ - return container_of(req, struct ablkcipher_request, base); -} - -/** - * ablkcipher_request_alloc() - allocate request data structure - * @tfm: cipher handle to be registered with the request - * @gfp: memory allocation flag that is handed to kmalloc by the API call. - * - * Allocate the request data structure that must be used with the ablkcipher - * encrypt and decrypt API calls. During the allocation, the provided ablkcipher - * handle is registered in the request data structure. - * - * Return: allocated request handle in case of success, or NULL if out of memory - */ -static inline struct ablkcipher_request *ablkcipher_request_alloc( - struct crypto_ablkcipher *tfm, gfp_t gfp) -{ - struct ablkcipher_request *req; - - req = kmalloc(sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(tfm), gfp); - - if (likely(req)) - ablkcipher_request_set_tfm(req, tfm); - - return req; -} - -/** - * ablkcipher_request_free() - zeroize and free request data structure - * @req: request data structure cipher handle to be freed - */ -static inline void ablkcipher_request_free(struct ablkcipher_request *req) -{ - kzfree(req); -} - -/** - * ablkcipher_request_set_callback() - set asynchronous callback function - * @req: request handle - * @flags: specify zero or an ORing of the flags - * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and - * increase the wait queue beyond the initial maximum size; - * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep - * @compl: callback function pointer to be registered with the request handle - * @data: The data pointer refers to memory that is not used by the kernel - * crypto API, but provided to the callback function for it to use. Here, - * the caller can provide a reference to memory the callback function can - * operate on. As the callback function is invoked asynchronously to the - * related functionality, it may need to access data structures of the - * related functionality which can be referenced using this pointer. The - * callback function can access the memory via the "data" field in the - * crypto_async_request data structure provided to the callback function. - * - * This function allows setting the callback function that is triggered once the - * cipher operation completes. - * - * The callback function is registered with the ablkcipher_request handle and - * must comply with the following template:: - * - * void callback_function(struct crypto_async_request *req, int error) - */ -static inline void ablkcipher_request_set_callback( - struct ablkcipher_request *req, - u32 flags, crypto_completion_t compl, void *data) -{ - req->base.complete = compl; - req->base.data = data; - req->base.flags = flags; -} - -/** - * ablkcipher_request_set_crypt() - set data buffers - * @req: request handle - * @src: source scatter / gather list - * @dst: destination scatter / gather list - * @nbytes: number of bytes to process from @src - * @iv: IV for the cipher operation which must comply with the IV size defined - * by crypto_ablkcipher_ivsize - * - * This function allows setting of the source data and destination data - * scatter / gather lists. - * - * For encryption, the source is treated as the plaintext and the - * destination is the ciphertext. For a decryption operation, the use is - * reversed - the source is the ciphertext and the destination is the plaintext. - */ -static inline void ablkcipher_request_set_crypt( - struct ablkcipher_request *req, - struct scatterlist *src, struct scatterlist *dst, - unsigned int nbytes, void *iv) -{ - req->src = src; - req->dst = dst; - req->nbytes = nbytes; - req->info = iv; -} - /** * DOC: Single Block Cipher API * -- cgit v1.2.3 From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having a failing refcounting operations are problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happening without any control from the users of mmap subsystem. Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This utlimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- include/linux/bpf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b81cde47314..34a34445c009 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,8 +103,8 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - atomic_t refcnt ____cacheline_aligned; - atomic_t usercnt; + atomic64_t refcnt ____cacheline_aligned; + atomic64_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; }; @@ -783,9 +783,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); -struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, - bool uref); +void bpf_map_inc(struct bpf_map *map); +void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -- cgit v1.2.3 From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- include/linux/bpf.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 34a34445c009..fb606dc61a3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -485,7 +485,7 @@ struct bpf_func_info_aux { }; struct bpf_prog_aux { - atomic_t refcnt; + atomic64_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; @@ -770,9 +770,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); -struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); -struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); @@ -912,10 +912,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, - int i) +static inline void bpf_prog_add(struct bpf_prog *prog, int i) { - return ERR_PTR(-EOPNOTSUPP); } static inline void bpf_prog_sub(struct bpf_prog *prog, int i) @@ -926,9 +924,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog) { } -static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) +static inline void bpf_prog_inc(struct bpf_prog *prog) { - return ERR_PTR(-EOPNOTSUPP); } static inline struct bpf_prog *__must_check -- cgit v1.2.3 From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- include/linux/bpf.h | 11 ++++++++--- include/linux/vmalloc.h | 1 + include/uapi/linux/bpf.h | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fb606dc61a3a..e913dd5946ae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,7 @@ struct bpf_map_ops { u64 *imm, u32 off); int (*map_direct_value_meta)(const struct bpf_map *map, u64 imm, u32 *off); + int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); }; struct bpf_map_memory { @@ -96,9 +98,10 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; + char name[BPF_OBJ_NAME_LEN]; bool unpriv_array; - bool frozen; /* write-once */ - /* 48 bytes hole */ + bool frozen; /* write-once; write-protected by freeze_mutex */ + /* 22 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -106,7 +109,8 @@ struct bpf_map { atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; struct work_struct work; - char name[BPF_OBJ_NAME_LEN]; + struct mutex freeze_mutex; + u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4e7809408073..b4c58a191eb1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); +extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3 From c9eb55db8439057165f106164622c146cdd59468 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 16 Sep 2019 11:30:58 -0700 Subject: btrfs: get rid of pointless wtag variable in async-thread.c Commit ac0c7cf8be00 ("btrfs: fix crash when tracepoint arguments are freed by wq callbacks") added a void pointer, wtag, which is passed into trace_btrfs_all_work_done() instead of the freed work item. This is silly for a few reasons: 1. The freed work item still has the same address. 2. work is still in scope after it's freed, so assigning wtag doesn't stop anyone from using it. 3. The tracepoint has always taken a void * argument, so assigning wtag doesn't actually make things any more type-safe. (Note that the original bug in commit bc074524e123 ("btrfs: prefix fsid to all trace events") was that the void * was implicitly casted when it was passed to btrfs_work_owner() in the trace point itself). Instead, let's add some clearer warnings as comments. Reviewed-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 75ae1899452b..8ca7401bc2fb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1389,9 +1389,9 @@ DECLARE_EVENT_CLASS(btrfs__work, ); /* - * For situiations when the work is freed, we pass fs_info and a tag that that - * matches address of the work structure so it can be paired with the - * scheduling event. + * For situations when the work is freed, we pass fs_info and a tag that matches + * the address of the work structure so it can be paired with the scheduling + * event. DO NOT add anything here that dereferences wtag. */ DECLARE_EVENT_CLASS(btrfs__work__done, -- cgit v1.2.3 From b9b1a53e180e2ab4cea42c95bf344e4bf7f524e7 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 10 Oct 2019 15:59:58 +0800 Subject: btrfs: use enum for extent type defines Use enum to replace macro definitions of extent types. Signed-off-by: Chengguang Xu Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index b65c7ee75bc7..44136de2f5d7 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -737,10 +737,12 @@ struct btrfs_balance_item { __le64 unused[4]; } __attribute__ ((__packed__)); -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 -#define BTRFS_FILE_EXTENT_TYPES 2 +enum { + BTRFS_FILE_EXTENT_INLINE = 0, + BTRFS_FILE_EXTENT_REG = 1, + BTRFS_FILE_EXTENT_PREALLOC = 2, + BTRFS_NR_FILE_EXTENT_TYPES = 3, +}; struct btrfs_file_extent_item { /* -- cgit v1.2.3 From 94c3f6c6b804c85c91aae0494c205ec7c81ce8d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 17 Oct 2019 13:28:55 +0200 Subject: btrfs: tracepoints: drop typecasts from printk Remove typecasts from trace printk, adjust types and move typecast to the assignment if necessary. When assigning, the types are more obvious compared to matching the variables to the format strings. Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ca7401bc2fb..671bf866a459 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -170,7 +170,7 @@ DECLARE_EVENT_CLASS(btrfs__inode, TP_STRUCT__entry_btrfs( __field( u64, ino ) - __field( blkcnt_t, blocks ) + __field( u64, blocks ) __field( u64, disk_i_size ) __field( u64, generation ) __field( u64, last_trans ) @@ -194,7 +194,7 @@ DECLARE_EVENT_CLASS(btrfs__inode, show_root_type(__entry->root_objectid), __entry->generation, __entry->ino, - (unsigned long long)__entry->blocks, + __entry->blocks, __entry->disk_i_size, __entry->last_trans, __entry->logged_trans) @@ -574,7 +574,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __field( char, for_kupdate ) __field( char, for_reclaim ) __field( char, range_cyclic ) - __field( pgoff_t, writeback_index ) + __field( unsigned long, writeback_index ) __field( u64, root_objectid ) ), @@ -603,7 +603,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->range_start, __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, - (unsigned long)__entry->writeback_index) + __entry->writeback_index) ); DEFINE_EVENT(btrfs__writepage, __extent_writepage, @@ -622,7 +622,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, TP_STRUCT__entry_btrfs( __field( u64, ino ) - __field( pgoff_t, index ) + __field( unsigned long, index ) __field( u64, start ) __field( u64, end ) __field( int, uptodate ) @@ -642,7 +642,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu start=%llu " "end=%llu uptodate=%d", show_root_type(__entry->root_objectid), - __entry->ino, (unsigned long)__entry->index, + __entry->ino, __entry->index, __entry->start, __entry->end, __entry->uptodate) ); @@ -1325,17 +1325,17 @@ TRACE_EVENT(alloc_extent_state, TP_STRUCT__entry( __field(const struct extent_state *, state) __field(gfp_t, mask) - __field(unsigned long, ip) + __field(const void*, ip) ), TP_fast_assign( __entry->state = state, __entry->mask = mask, - __entry->ip = IP + __entry->ip = (const void *)IP ), TP_printk("state=%p mask=%s caller=%pS", __entry->state, - show_gfp_flags(__entry->mask), (const void *)__entry->ip) + show_gfp_flags(__entry->mask), __entry->ip) ); TRACE_EVENT(free_extent_state, @@ -1346,16 +1346,15 @@ TRACE_EVENT(free_extent_state, TP_STRUCT__entry( __field(const struct extent_state *, state) - __field(unsigned long, ip) + __field(const void*, ip) ), TP_fast_assign( __entry->state = state, - __entry->ip = IP + __entry->ip = (const void *)IP ), - TP_printk("state=%p caller=%pS", __entry->state, - (const void *)__entry->ip) + TP_printk("state=%p caller=%pS", __entry->state, __entry->ip) ); DECLARE_EVENT_CLASS(btrfs__work, @@ -1567,8 +1566,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, ), TP_printk_btrfs("bytenr=%llu num_bytes=%llu", - (unsigned long long)__entry->bytenr, - (unsigned long long)__entry->num_bytes) + __entry->bytenr, __entry->num_bytes) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, -- cgit v1.2.3 From 1d2e7c7c3ed73cc510a4dc093df2a935092ff5ad Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 17 Oct 2019 13:28:57 +0200 Subject: btrfs: tracepoints: constify all pointers We don't modify the data passed to tracepoints, some of the declarations are already const, add it to the rest. Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 52 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 671bf866a459..522d1f2b13e3 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -292,7 +292,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, TRACE_EVENT(btrfs_handle_em_exist, - TP_PROTO(struct btrfs_fs_info *fs_info, + TP_PROTO(const struct btrfs_fs_info *fs_info, const struct extent_map *existing, const struct extent_map *map, u64 start, u64 len), @@ -330,8 +330,8 @@ TRACE_EVENT(btrfs_handle_em_exist, /* file extent item */ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start), @@ -385,8 +385,8 @@ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, DECLARE_EVENT_CLASS( btrfs__file_extent_item_inline, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, int slot, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start), @@ -426,8 +426,8 @@ DECLARE_EVENT_CLASS( DEFINE_EVENT( btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start) ); @@ -435,8 +435,8 @@ DEFINE_EVENT( DEFINE_EVENT( btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, u64 start), TP_ARGS(bi, l, fi, start) ); @@ -444,8 +444,8 @@ DEFINE_EVENT( DEFINE_EVENT( btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, int slot, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start) ); @@ -453,8 +453,8 @@ DEFINE_EVENT( DEFINE_EVENT( btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline, - TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, - struct btrfs_file_extent_item *fi, int slot, u64 start), + TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l, + const struct btrfs_file_extent_item *fi, int slot, u64 start), TP_ARGS(bi, l, fi, slot, start) ); @@ -1018,7 +1018,7 @@ TRACE_EVENT(btrfs_cow_block, TRACE_EVENT(btrfs_space_reservation, - TP_PROTO(const struct btrfs_fs_info *fs_info, char *type, u64 val, + TP_PROTO(const struct btrfs_fs_info *fs_info, const char *type, u64 val, u64 bytes, int reserve), TP_ARGS(fs_info, type, val, bytes, reserve), @@ -1051,7 +1051,7 @@ TRACE_EVENT(btrfs_space_reservation, TRACE_EVENT(btrfs_trigger_flush, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, - int flush, char *reason), + int flush, const char *reason), TP_ARGS(fs_info, flags, bytes, flush, reason), @@ -1642,7 +1642,7 @@ TRACE_EVENT(btrfs_qgroup_account_extent, TRACE_EVENT(qgroup_update_counters, TP_PROTO(const struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup, + const struct btrfs_qgroup *qgroup, u64 cur_old_count, u64 cur_new_count), TP_ARGS(fs_info, qgroup, cur_old_count, cur_new_count), @@ -1823,7 +1823,7 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, ); TRACE_EVENT(btrfs_inode_mod_outstanding_extents, - TP_PROTO(struct btrfs_root *root, u64 ino, int mod), + TP_PROTO(const struct btrfs_root *root, u64 ino, int mod), TP_ARGS(root, ino, mod), @@ -1904,7 +1904,7 @@ TRACE_EVENT(btrfs_set_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; if (tree->private_data) { - struct inode *inode = tree->private_data; + const struct inode *inode = tree->private_data; __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->rootid = @@ -1943,7 +1943,7 @@ TRACE_EVENT(btrfs_clear_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; if (tree->private_data) { - struct inode *inode = tree->private_data; + const struct inode *inode = tree->private_data; __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->rootid = @@ -1983,7 +1983,7 @@ TRACE_EVENT(btrfs_convert_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; if (tree->private_data) { - struct inode *inode = tree->private_data; + const struct inode *inode = tree->private_data; __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->rootid = @@ -2092,8 +2092,8 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); DECLARE_EVENT_CLASS(btrfs__space_info_update, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, u64 old, s64 diff), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff), @@ -2115,16 +2115,16 @@ DECLARE_EVENT_CLASS(btrfs__space_info_update, DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, u64 old, s64 diff), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff) ); DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, u64 old, s64 diff), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, u64 old, s64 diff), TP_ARGS(fs_info, sinfo, old, diff) ); -- cgit v1.2.3 From 496074f94b19574c77240d3b3f84cfb1097de51d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:31:28 -0800 Subject: blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 Currently, cgroup rstat is supported only on cgroup2 hierarchy and rstat functions shouldn't be called on cgroup1 cgroups. While converting blk-cgroup core statistics to rstat, f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") accidentally ended up calling cgroup_rstat_updated() on cgroup1 cgroups causing crashes. Longer term, we probably should add cgroup1 support to rstat but for now let's mask the call directly. Fixes: f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") Tested-by: Faiz Abbas Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 48a66738143d..19394c77ed99 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -626,7 +626,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, bis->cur.ios[rwd]++; u64_stats_update_end(&bis->sync); - cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); put_cpu(); } -- cgit v1.2.3 From 3951e7f050ac6a38bbc859fc3cd6093890c31d1c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2019 11:11:01 +0200 Subject: btrfs: add xxhash64 to checksumming algorithms Add xxhash64 to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 44136de2f5d7..962312939a8f 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -302,6 +302,7 @@ /* csum types */ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, + BTRFS_CSUM_TYPE_XXHASH = 1, }; /* -- cgit v1.2.3 From 3831bf0094abed51e71cbeca8b6edf8b88c2644b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2019 11:11:02 +0200 Subject: btrfs: add sha256 to checksumming algorithm Add sha256 to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 962312939a8f..e2823c9b2061 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -303,6 +303,7 @@ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, BTRFS_CSUM_TYPE_XXHASH = 1, + BTRFS_CSUM_TYPE_SHA256 = 2, }; /* -- cgit v1.2.3 From 352ae07b599a03b08a2149d9c2ff9c3479591af2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 7 Oct 2019 11:11:02 +0200 Subject: btrfs: add blake2b to checksumming algorithms Add blake2b (with 256 bit digest) to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index e2823c9b2061..5160be1d7332 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -304,6 +304,7 @@ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, BTRFS_CSUM_TYPE_XXHASH = 1, BTRFS_CSUM_TYPE_SHA256 = 2, + BTRFS_CSUM_TYPE_BLAKE2 = 3, }; /* -- cgit v1.2.3 From bf38be65f3703d5ef3661c0a2802bc28e76b8f19 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 23 Oct 2019 18:48:11 +0200 Subject: btrfs: move block_group_item::used to block group For unknown reasons, the member 'used' in the block group struct is stored in the b-tree item and accessed everywhere using the special accessor helper. Let's unify it and make it a regular member and only update the item before writing it to the tree. The item is still being used for flags and chunk_objectid, there's some duplication until the item is removed in following patches. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 522d1f2b13e3..7b842b049ea3 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -716,8 +716,7 @@ TRACE_EVENT(btrfs_add_block_group, __entry->offset = block_group->key.objectid; __entry->size = block_group->key.offset; __entry->flags = block_group->flags; - __entry->bytes_used = - btrfs_block_group_used(&block_group->item); + __entry->bytes_used = block_group->used; __entry->bytes_super = block_group->bytes_super; __entry->create = create; ), @@ -1859,7 +1858,7 @@ DECLARE_EVENT_CLASS(btrfs__block_group, TP_fast_assign_btrfs(bg_cache->fs_info, __entry->bytenr = bg_cache->key.objectid, __entry->len = bg_cache->key.offset, - __entry->used = btrfs_block_group_used(&bg_cache->item); + __entry->used = bg_cache->used; __entry->flags = bg_cache->flags; ), -- cgit v1.2.3 From b3470b5dbe1300dea94191ae4b7d070be9a5cdc9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 23 Oct 2019 18:48:22 +0200 Subject: btrfs: add dedicated members for start and length of a block group The on-disk format of block group item makes use of the key that stores the offset and length. This is further used in the code, although this makes thing harder to understand. The key is also packed so the offset/length is not properly aligned as u64. Add start (key.objectid) and length (key.offset) members to block group and remove the embedded key. When the item is searched or written, a local variable for key is used. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 7b842b049ea3..070619891915 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -713,8 +713,8 @@ TRACE_EVENT(btrfs_add_block_group, ), TP_fast_assign_btrfs(fs_info, - __entry->offset = block_group->key.objectid; - __entry->size = block_group->key.offset; + __entry->offset = block_group->start; + __entry->size = block_group->length; __entry->flags = block_group->flags; __entry->bytes_used = block_group->used; __entry->bytes_super = block_group->bytes_super; @@ -1197,7 +1197,7 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, ), TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->key.objectid; + __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->start = start; __entry->len = len; @@ -1245,7 +1245,7 @@ TRACE_EVENT(btrfs_find_cluster, ), TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->key.objectid; + __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->start = start; __entry->bytes = bytes; @@ -1272,7 +1272,7 @@ TRACE_EVENT(btrfs_failed_cluster_setup, ), TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->key.objectid; + __entry->bg_objectid = block_group->start; ), TP_printk_btrfs("block_group=%llu", __entry->bg_objectid) @@ -1296,7 +1296,7 @@ TRACE_EVENT(btrfs_setup_cluster, ), TP_fast_assign_btrfs(block_group->fs_info, - __entry->bg_objectid = block_group->key.objectid; + __entry->bg_objectid = block_group->start; __entry->flags = block_group->flags; __entry->start = cluster->window_start; __entry->max_size = cluster->max_size; @@ -1856,8 +1856,8 @@ DECLARE_EVENT_CLASS(btrfs__block_group, ), TP_fast_assign_btrfs(bg_cache->fs_info, - __entry->bytenr = bg_cache->key.objectid, - __entry->len = bg_cache->key.offset, + __entry->bytenr = bg_cache->start, + __entry->len = bg_cache->length, __entry->used = bg_cache->used; __entry->flags = bg_cache->flags; ), -- cgit v1.2.3 From 47e6f7423b9196ad6832d26cae52b7015f81ee7f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Mar 2018 22:56:53 +0100 Subject: btrfs: add support for 3-copy replication (raid1c3) Add new block group profile to store 3 copies in a simliar way that current RAID1 does. The profile attributes and constraints are defined in the raid table and used by the same code that already handles the 2-copy RAID1. The minimum number of devices is 3, the maximum number of devices/chunks that can be lost/damaged is 2. Like RAID6 but with 33% space utilization. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 3 ++- include/uapi/linux/btrfs_tree.h | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 3ee0678c0a83..ba22f91a3f5b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -831,7 +831,8 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_TGT_REPLACE, BTRFS_ERROR_DEV_MISSING_NOT_FOUND, BTRFS_ERROR_DEV_ONLY_WRITABLE, - BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS + BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, + BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 5160be1d7332..52b2964b0311 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -841,6 +841,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -852,6 +853,7 @@ enum btrfs_raid_types { BTRFS_RAID_SINGLE, BTRFS_RAID_RAID5, BTRFS_RAID_RAID6, + BTRFS_RAID_RAID1C3, BTRFS_NR_RAID_TYPES }; @@ -861,6 +863,7 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3 | \ BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ @@ -868,7 +871,8 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6) -#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1) +#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3) /* * We need a bit for restriper to be able to tell when chunks of type -- cgit v1.2.3 From 8d6fac0087e538173f34ca7431ed9b58581acf28 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Mar 2018 22:56:53 +0100 Subject: btrfs: add support for 4-copy replication (raid1c4) Add new block group profile to store 4 copies in a simliar way that current RAID1 does. The profile attributes and constraints are defined in the raid table and used by the same code that already handles the 2- and 3-copy RAID1. The minimum number of devices is 4, the maximum number of devices/chunks that can be lost/damaged is 3. There is no comparable traditional RAID level, the profile is added for future needs to accompany triple-parity and beyond. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index ba22f91a3f5b..a2b761275bba 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -833,6 +833,7 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_ONLY_WRITABLE, BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 52b2964b0311..8e322e2c7e78 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -842,6 +842,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) +#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -854,6 +855,7 @@ enum btrfs_raid_types { BTRFS_RAID_RAID5, BTRFS_RAID_RAID6, BTRFS_RAID_RAID1C3, + BTRFS_RAID_RAID1C4, BTRFS_NR_RAID_TYPES }; @@ -864,6 +866,7 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4 | \ BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ @@ -872,7 +875,8 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6) #define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID1C3) + BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4) /* * We need a bit for restriper to be able to tell when chunks of type -- cgit v1.2.3 From cfbb825c76198c9095428c5f9362fbf6ae06f417 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 10 Jul 2018 18:15:05 +0200 Subject: btrfs: add incompat for raid1 with 3, 4 copies The new raid1c3 and raid1c4 profiles are backward incompatible and the name shall be 'raid1c34', the status can be found in the global supported features in /sys/fs/btrfs/features or in the per-filesystem directory. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a2b761275bba..7a8bc8b920f5 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -270,6 +270,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) #define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) +#define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) struct btrfs_ioctl_feature_flags { __u64 compat_flags; -- cgit v1.2.3 From 32da5386d9a4fd5c1155cecf703df104d918954c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 29 Oct 2019 19:20:18 +0100 Subject: btrfs: rename btrfs_block_group_cache The type name is misleading, a single entry is named 'cache' while this normally means a collection of objects. Rename that everywhere. Also the identifier was quite long, making function prototypes harder to format. Suggested-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 070619891915..620bf1b38fba 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -19,7 +19,7 @@ struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; -struct btrfs_block_group_cache; +struct btrfs_block_group; struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; @@ -699,7 +699,7 @@ TRACE_EVENT(btrfs_sync_fs, TRACE_EVENT(btrfs_add_block_group, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_block_group_cache *block_group, int create), + const struct btrfs_block_group *block_group, int create), TP_ARGS(fs_info, block_group, create), @@ -1184,7 +1184,7 @@ TRACE_EVENT(find_free_extent, DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_block_group *block_group, u64 start, u64 len), TP_ARGS(block_group, start, len), @@ -1214,7 +1214,7 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_block_group *block_group, u64 start, u64 len), TP_ARGS(block_group, start, len) @@ -1222,7 +1222,7 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_block_group *block_group, u64 start, u64 len), TP_ARGS(block_group, start, len) @@ -1230,7 +1230,7 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, TRACE_EVENT(btrfs_find_cluster, - TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_block_group *block_group, u64 start, u64 bytes, u64 empty_size, u64 min_bytes), TP_ARGS(block_group, start, bytes, empty_size, min_bytes), @@ -1263,7 +1263,7 @@ TRACE_EVENT(btrfs_find_cluster, TRACE_EVENT(btrfs_failed_cluster_setup, - TP_PROTO(const struct btrfs_block_group_cache *block_group), + TP_PROTO(const struct btrfs_block_group *block_group), TP_ARGS(block_group), @@ -1280,7 +1280,7 @@ TRACE_EVENT(btrfs_failed_cluster_setup, TRACE_EVENT(btrfs_setup_cluster, - TP_PROTO(const struct btrfs_block_group_cache *block_group, + TP_PROTO(const struct btrfs_block_group *block_group, const struct btrfs_free_cluster *cluster, u64 size, int bitmap), @@ -1844,7 +1844,7 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents, ); DECLARE_EVENT_CLASS(btrfs__block_group, - TP_PROTO(const struct btrfs_block_group_cache *bg_cache), + TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache), @@ -1868,19 +1868,19 @@ DECLARE_EVENT_CLASS(btrfs__block_group, ); DEFINE_EVENT(btrfs__block_group, btrfs_remove_block_group, - TP_PROTO(const struct btrfs_block_group_cache *bg_cache), + TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group, - TP_PROTO(const struct btrfs_block_group_cache *bg_cache), + TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group, - TP_PROTO(const struct btrfs_block_group_cache *bg_cache), + TP_PROTO(const struct btrfs_block_group *bg_cache), TP_ARGS(bg_cache) ); -- cgit v1.2.3 From ae7c2d342a10dbef1e054482f46498b6282a1df0 Mon Sep 17 00:00:00 2001 From: Luhua Xu Date: Mon, 18 Nov 2019 12:57:16 +0800 Subject: spi: mediatek: add SPI_CS_HIGH support Change to use SPI_CS_HIGH to support spi CS polarity setting for chips support enhance_timing. Signed-off-by: Luhua Xu Link: https://lore.kernel.org/r/1574053037-26721-2-git-send-email-luhua.xu@mediatek.com Signed-off-by: Mark Brown --- include/linux/platform_data/spi-mt65xx.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/spi-mt65xx.h b/include/linux/platform_data/spi-mt65xx.h index f0e6d6483e62..65fd5ffd257c 100644 --- a/include/linux/platform_data/spi-mt65xx.h +++ b/include/linux/platform_data/spi-mt65xx.h @@ -11,7 +11,6 @@ /* Board specific platform_data */ struct mtk_chip_config { - u32 cs_pol; u32 sample_sel; }; #endif -- cgit v1.2.3 From 298e54fa810e027f1b0800d789eb862592721f08 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 15 Nov 2019 19:56:51 +0000 Subject: net: phy: add core phylib sfp support Add core phylib help for supporting SFP sockets on PHYs. This provides a mechanism to inform the SFP layer about PHY up/down events, and also unregister the SFP bus when the PHY is going away. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 78436d58ce7c..124516fe2763 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -203,6 +203,8 @@ static inline const char *phy_modes(phy_interface_t interface) struct device; struct phylink; +struct sfp_bus; +struct sfp_upstream_ops; struct sk_buff; /* @@ -342,6 +344,8 @@ struct phy_c45_device_ids { * dev_flags: Device-specific flags used by the PHY driver. * irq: IRQ number of the PHY's interrupt (-1 if none) * phy_timer: The timer for handling the state machine + * sfp_bus_attached: flag indicating whether the SFP bus has been attached + * sfp_bus: SFP bus attached to this PHY's fiber port * attached_dev: The attached enet driver's device instance ptr * adjust_link: Callback for the enet controller to respond to * changes in the link state. @@ -432,6 +436,9 @@ struct phy_device { struct mutex lock; + /* This may be modified under the rtnl lock */ + bool sfp_bus_attached; + struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; @@ -1020,6 +1027,10 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); +void phy_sfp_attach(void *upstream, struct sfp_bus *bus); +void phy_sfp_detach(void *upstream, struct sfp_bus *bus); +int phy_sfp_probe(struct phy_device *phydev, + const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); -- cgit v1.2.3 From 7c9e69428da39ed761c9d903c4850368fa4ef7bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:43 +0100 Subject: page_pool: add destroy attempts counter and rename tracepoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Jonathan change the page_pool to become responsible to its own shutdown via deferred work queue, then the disconnect_cnt counter was removed from xdp memory model tracepoint. This patch change the page_pool_inflight tracepoint name to page_pool_release, because it reflects the new responsability better. And it reintroduces a counter that reflect the number of times page_pool_release have been tried. The counter is also used by the code, to only empty the alloc cache once. With a stuck work queue running every second and counter being 64-bit, it will overrun in approx 584 billion years. For comparison, Earth lifetime expectancy is 7.5 billion years, before the Sun will engulf, and destroy, the Earth. Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/net/page_pool.h | 2 ++ include/trace/events/page_pool.h | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1121faa99c12..ace881c15dcb 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -112,6 +112,8 @@ struct page_pool { * refcnt serves purpose is to simplify drivers error handling. */ refcount_t user_cnt; + + u64 destroy_cnt; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 47b5ee880aa9..ee7f1aca7839 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -10,7 +10,7 @@ #include -TRACE_EVENT(page_pool_inflight, +TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), @@ -22,6 +22,7 @@ TRACE_EVENT(page_pool_inflight, __field(s32, inflight) __field(u32, hold) __field(u32, release) + __field(u64, cnt) ), TP_fast_assign( @@ -29,10 +30,12 @@ TRACE_EVENT(page_pool_inflight, __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; + __entry->cnt = pool->destroy_cnt; ), - TP_printk("page_pool=%p inflight=%d hold=%u release=%u", - __entry->pool, __entry->inflight, __entry->hold, __entry->release) + TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", + __entry->pool, __entry->inflight, __entry->hold, + __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, -- cgit v1.2.3 From 832ccf6f80cda06ad2373cd1f40291b0183958b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:48 +0100 Subject: page_pool: extend tracepoint to also include the page PFN The MM tracepoint for page free (called kmem:mm_page_free) doesn't provide the page pointer directly, instead it provides the PFN (Page Frame Number). This is annoying when writing a page_pool leak detector in BPF. This patch change page_pool tracepoints to also provide the PFN. The page pointer is still provided to allow other kinds of troubleshooting from BPF. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/page_pool.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index ee7f1aca7839..2f2a10e8eb56 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -8,6 +8,7 @@ #include #include +#include #include TRACE_EVENT(page_pool_release, @@ -49,16 +50,18 @@ TRACE_EVENT(page_pool_state_release, __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, release) + __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->release = release; + __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p release=%u", - __entry->pool, __entry->page, __entry->release) + TP_printk("page_pool=%p page=%p pfn=%lu release=%u", + __entry->pool, __entry->page, __entry->pfn, __entry->release) ); TRACE_EVENT(page_pool_state_hold, @@ -72,16 +75,18 @@ TRACE_EVENT(page_pool_state_hold, __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, hold) + __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->hold = hold; + __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p hold=%u", - __entry->pool, __entry->page, __entry->hold) + TP_printk("page_pool=%p page=%p pfn=%lu hold=%u", + __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); #endif /* _TRACE_PAGE_POOL_H */ -- cgit v1.2.3 From d4ffb02dee2fcb20e0c8086a8d1305bf885820bb Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 18 Nov 2019 10:40:51 -0500 Subject: net/tls: enable sk_msg redirect to tls socket egress Bring back tls_sw_sendpage_locked. sk_msg redirection into a socket with TLS_TX takes the following path: tcp_bpf_sendmsg_redir tcp_bpf_push_locked tcp_bpf_push kernel_sendpage_locked sock->ops->sendpage_locked Also update the flags test in tls_sw_sendpage_locked to allow flag MSG_NO_SHARED_FRAGS. bpf_tcp_sendmsg sets this. Link: https://lore.kernel.org/netdev/CA+FuTSdaAawmZ2N8nfDDKu3XLpXBbMtcCT0q4FntDD2gn8ASUw@mail.gmail.com/T/#t Link: https://github.com/wdebruij/kerneltools/commits/icept.2 Fixes: 0608c69c9a80 ("bpf: sk_msg, sock{map|hash} redirect through ULP") Fixes: f3de19af0f5b ("Revert \"net/tls: remove unused function tls_sw_sendpage_locked\"") Signed-off-by: Willem de Bruijn Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 794e297483ea..f4ad831eaa02 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -356,6 +356,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tls_sw_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); -- cgit v1.2.3 From 7cd9a58d6860ae09acd7f0c219b5fa333703f72f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:52 +0100 Subject: netfilter: nf_tables: constify nft_reg_load{8, 16, 64}() This patch constifies the pointer to source register data that is passed as an input parameter. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 87b758407868..fe7c50acc681 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -114,7 +114,7 @@ static inline void nft_reg_store8(u32 *dreg, u8 val) *(u8 *)dreg = val; } -static inline u8 nft_reg_load8(u32 *sreg) +static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } @@ -125,7 +125,7 @@ static inline void nft_reg_store16(u32 *dreg, u16 val) *(u16 *)dreg = val; } -static inline u16 nft_reg_load16(u32 *sreg) +static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } @@ -135,7 +135,7 @@ static inline void nft_reg_store64(u32 *dreg, u64 val) put_unaligned(val, (u64 *)dreg); } -static inline u64 nft_reg_load64(u32 *sreg) +static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } -- cgit v1.2.3 From 8819efc9430142957c9c8fc7c09d9107e2061b87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:53 +0100 Subject: netfilter: nf_tables_offload: allow ethernet interface type only Hardware offload support at this stage assumes an ethernet device in place. The flow dissector provides the intermediate representation to express this selector, so extend it to allow to store the interface type. Flower does not uses this, so skb_flow_dissect_meta() is not extended to match on this new field. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index b1063db63e66..1a0727d1acfa 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -203,9 +203,11 @@ struct flow_dissector_key_ip { /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex + * @ingress_iftype: ingress interface type */ struct flow_dissector_key_meta { int ingress_ifindex; + u16 ingress_iftype; }; /** -- cgit v1.2.3 From a82055af595946aea461528e551e6ae064b3d560 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Nov 2019 23:05:54 +0100 Subject: netfilter: nft_payload: add VLAN offload support Match on ethertype and set up protocol dependency. Check for protocol dependency before accessing the tci field. Allow to match on the encapsulated ethertype too. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a0727d1acfa..f06b0239c32b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -48,9 +48,12 @@ struct flow_dissector_key_tags { }; struct flow_dissector_key_vlan { - u16 vlan_id:12, - vlan_dei:1, - vlan_priority:3; + union { + u16 vlan_id:12, + vlan_dei:1, + vlan_priority:3; + __be16 vlan_tci; + }; __be16 vlan_tpid; }; -- cgit v1.2.3 From bc836748707cf6b8b1a948b61149278f109107da Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:17 +0000 Subject: page_pool: Add API to update numa node Add page_pool_update_nid() to be called by page pool consumers when they detect numa node changes. It will update the page pool nid value to start allocating from the new effective numa node. This is to mitigate page pool allocating pages from a wrong numa node, where the pool was originally allocated, and holding on to pages that belong to a different numa node, which causes performance degradation. For pages that are already being consumed and could be returned to the pool by the consumer, in next patch we will add a check per page to avoid recycling them back to the pool and return them to the page allocator. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 7 +++++++ include/trace/events/page_pool.h | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ace881c15dcb..e2e1b7b1e8ba 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -204,4 +204,11 @@ static inline bool page_pool_put(struct page_pool *pool) return refcount_dec_and_test(&pool->user_cnt); } +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid); +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} #endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 2f2a10e8eb56..ad0aa7f31675 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -89,6 +89,28 @@ TRACE_EVENT(page_pool_state_hold, __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); +TRACE_EVENT(page_pool_update_nid, + + TP_PROTO(const struct page_pool *pool, int new_nid), + + TP_ARGS(pool, new_nid), + + TP_STRUCT__entry( + __field(const struct page_pool *, pool) + __field(int, pool_nid) + __field(int, new_nid) + ), + + TP_fast_assign( + __entry->pool = pool; + __entry->pool_nid = pool->p.nid; + __entry->new_nid = new_nid; + ), + + TP_printk("page_pool=%p pool_nid=%d new_nid=%d", + __entry->pool, __entry->pool_nid, __entry->new_nid) +); + #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ -- cgit v1.2.3 From cec2975f2b7058c42330a0f8164d94c6b7c8c446 Mon Sep 17 00:00:00 2001 From: Gautam Ramakrishnan Date: Wed, 20 Nov 2019 19:43:54 +0530 Subject: net: sched: pie: enable timestamp based delay calculation RFC 8033 suggests an alternative approach to calculate the queue delay in PIE by using a timestamp on every enqueued packet. This patch adds an implementation of that approach and sets it as the default method to calculate queue delay. The previous method (based on Little's law) to calculate queue delay is set as optional. Signed-off-by: Gautam Ramakrishnan Signed-off-by: Leslie Monis Signed-off-by: Mohit P. Tahiliani Acked-by: Dave Taht Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5011259b8f67..9f1a72876212 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -950,19 +950,25 @@ enum { TCA_PIE_BETA, TCA_PIE_ECN, TCA_PIE_BYTEMODE, + TCA_PIE_DQ_RATE_ESTIMATOR, __TCA_PIE_MAX }; #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) struct tc_pie_xstats { - __u64 prob; /* current probability */ - __u32 delay; /* current delay in ms */ - __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ - __u32 packets_in; /* total number of packets enqueued */ - __u32 dropped; /* packets dropped due to pie_action */ - __u32 overlimit; /* dropped due to lack of space in queue */ - __u32 maxq; /* maximum queue size */ - __u32 ecn_mark; /* packets marked with ecn*/ + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in + * bits/pie_time + */ + __u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space + * in queue + */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ }; /* CBS */ -- cgit v1.2.3 From e68bc75691cc3de608c2c7505057c948d13ae587 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Nov 2019 16:54:18 +0200 Subject: net: page_pool: add the possibility to sync DMA memory for device Introduce the following parameters in order to add the possibility to sync DMA memory for device before putting allocated pages in the page_pool caches: - PP_FLAG_DMA_SYNC_DEV: if set in page_pool_params flags, all pages that the driver gets from page_pool will be DMA-synced-for-device according to the length provided by the device driver. Please note DMA-sync-for-CPU is still device driver responsibility - offset: DMA address offset where the DMA engine starts copying rx data - max_len: maximum DMA memory size page_pool is allowed to flush. This is currently used in __page_pool_alloc_pages_slow routine when pages are allocated from page allocator These parameters are supposed to be set by device drivers. This optimization reduces the length of the DMA-sync-for-device. The optimization is valid because pages are initially DMA-synced-for-device as defined via max_len. At RX time, the driver will perform a DMA-sync-for-CPU on the memory for the packet length. What is important is the memory occupied by packet payload, because this is the area CPU is allowed to read and modify. As we don't track cache-lines written into by the CPU, simply use the packet payload length as dma_sync_size at page_pool recycle time. This also take into account any tail-extend. Tested-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index e2e1b7b1e8ba..cfbed00ba7ee 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,8 +34,18 @@ #include #include -#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ -#define PP_FLAG_ALL PP_FLAG_DMA_MAP +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) /* * Fast allocation side cache array/stack @@ -65,6 +75,8 @@ struct page_pool_params { int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ enum dma_data_direction dma_dir; /* DMA mapping direction */ + unsigned int max_len; /* max DMA sync memory size */ + unsigned int offset; /* DMA addr offset */ }; struct page_pool { @@ -151,8 +163,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, #endif /* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct); +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); static inline void page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) @@ -161,14 +173,14 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, allow_direct); + __page_pool_put_page(pool, page, -1, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, true); + __page_pool_put_page(pool, page, -1, true); } /* Disconnects a page (from a page_pool). API users can have a need -- cgit v1.2.3 From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] ---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- include/linux/audit.h | 3 +++ include/uapi/linux/audit.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index aee3dc9eb378..edd006f4597d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -159,6 +159,7 @@ extern void audit_log_key(struct audit_buffer *ab, extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); +extern void audit_log_task(struct audit_buffer *ab); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -219,6 +220,8 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_link_denied(const char *string) { } +static inline void audit_log_task(struct audit_buffer *ab) +{ } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c89c6495983d..32a5db900f47 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,6 +116,7 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ +#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From 196e8ca74886c433dcfc64a809707074b936aaf5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 23:04:44 +0100 Subject: bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size Given we recently extended the original bpf_map_area_alloc() helper in commit fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY"), we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts, extend it for bpf-next. Reported-by: Stephen Rothwell Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e913dd5946ae..e89e86122233 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -794,12 +794,12 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(size_t size, int numa_node); -void *bpf_map_area_mmapable_alloc(size_t size, int numa_node); +void *bpf_map_area_alloc(u64 size, int numa_node); +void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); -- cgit v1.2.3 From f3c9a666b28572b1a0ae691a47d9a7de4d9cefb3 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 20 Nov 2019 12:29:59 +0000 Subject: net: sfp: soft status and control support Add support for the soft status and control register, which allows TX_FAULT and RX_LOS to be monitored and TX_DISABLE to be set. We make use of this when the board does not support GPIOs for these signals. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/sfp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 3b35efd85bb1..487fd9412d10 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -428,6 +428,10 @@ enum { SFP_TEC_CUR = 0x6c, SFP_STATUS = 0x6e, + SFP_STATUS_TX_DISABLE = BIT(7), + SFP_STATUS_TX_DISABLE_FORCE = BIT(6), + SFP_STATUS_TX_FAULT = BIT(2), + SFP_STATUS_RX_LOS = BIT(1), SFP_ALARM0 = 0x70, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), -- cgit v1.2.3 From b6866318657717c8914673a6394894d12bc9ff5e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 21 Nov 2019 13:40:26 +0300 Subject: block: add iostat counters for flush requests Requests that triggers flushing volatile writeback cache to disk (barriers) have significant effect to overall performance. Block layer has sophisticated engine for combining several flush requests into one. But there is no statistics for actual flushes executed by disk. Requests which trigger flushes usually are barriers - zero-size writes. This patch adds two iostat counters into /sys/class/block/$dev/stat and /proc/diskstats - count of completed flush requests and their total time. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 23a2fd534817..70254ae11769 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -377,6 +377,7 @@ enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, + STAT_FLUSH, NR_STAT_GROUPS }; -- cgit v1.2.3 From 52deba0f02a98c150677a9c381cc1991a928bcff Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 15 Nov 2019 00:40:00 +0900 Subject: nvme: hwmon: provide temperature min and max values for each sensor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the NVMe specification, the over temperature threshold and under temperature threshold features shall be implemented for Composite Temperature if a non-zero WCTEMP field value is reported in the Identify Controller data structure. The features are also implemented for all implemented temperature sensors (i.e., all Temperature Sensor fields that report a non-zero value). This provides the over temperature threshold and under temperature threshold for each sensor as temperature min and max values of hwmon sysfs attributes. The WCTEMP is already provided as a temperature max value for Composite Temperature, but this change isn't incompatible. Because the default value of the over temperature threshold for Composite Temperature is the WCTEMP. Now the alarm attribute for Composite Temperature indicates one of the temperature is outside of a temperature threshold. Because there is only a single bit in Critical Warning field that indicates a temperature is outside of a threshold. Example output from the "sensors" command: nvme-pci-0100 Adapter: PCI adapter Composite: +33.9°C (low = -273.1°C, high = +69.8°C) (crit = +79.8°C) Sensor 1: +34.9°C (low = -273.1°C, high = +65261.8°C) Sensor 2: +31.9°C (low = -273.1°C, high = +65261.8°C) Sensor 5: +47.9°C (low = -273.1°C, high = +65261.8°C) This also adds helper macros for kelvin from/to milli Celsius conversion, and replaces the repeated code in hwmon.c. Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Jean Delvare Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Akinobu Mita Signed-off-by: Keith Busch --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3eca4f7d8510..3d5189f46cb1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -804,6 +804,12 @@ struct nvme_write_zeroes_cmd { /* Features */ +enum { + NVME_TEMP_THRESH_MASK = 0xffff, + NVME_TEMP_THRESH_SELECT_SHIFT = 16, + NVME_TEMP_THRESH_TYPE_UNDER = 0x100000, +}; + struct nvme_feat_auto_pst { __le64 entries[32]; }; -- cgit v1.2.3 From fca3f91cc38ad866c995fb099d961b31cd687849 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:26 +0800 Subject: net: sched: add vxlan option support to act_tunnel_key This patch is to allow setting vxlan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. gbp is the only param for vxlan options: # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ vxlan_opts 01020304 \ action mirred egress redirect dev vxlan0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 41c8b462c177..f302c2a76953 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -50,6 +50,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -67,4 +71,13 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + #endif -- cgit v1.2.3 From e20d4ff2acd7db2ffce64a6ddbdaeec43a8eec19 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:27 +0800 Subject: net: sched: add erspan option support to act_tunnel_key This patch is to allow setting erspan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. Options are expressed as ver:index:dir:hwid, when ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. # ip link add name erspan1 type erspan external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ erspan_opts 1:2:0:0 \ action mirred egress redirect dev erspan1 v1->v2: - do the validation when dst is not yet allocated as Jakub suggested. - use Duplicate instead of Wrong in err msg for extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index f302c2a76953..3f10dc4e7a4b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -54,6 +54,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -80,4 +84,16 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX - 1) + #endif -- cgit v1.2.3 From d8f9dfae49ce4ffb772dc10dd6578dc815b34c12 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:28 +0800 Subject: net: sched: allow flower to match vxlan options This patch is to allow matching gbp option in vxlan. The options can be described in the form GBP/GBP_MASK, where GBP is represented as a 32bit hexadecimal value. Different from geneve, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev vxlan0 ingress # tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ vxlan_opts 01020304/ffffffff \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6ad22f76ede..929825d710e2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -571,6 +571,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -588,6 +592,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 79b1011cb33d166f531a1347a17e6602954e4eb1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:29 +0800 Subject: net: sched: allow flower to match erspan options This patch is to allow matching options in erspan. The options can be described in the form: VER:INDEX:DIR:HWID/VER:INDEX_MASK:DIR_MASK:HWID_MASK. When ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. Different from geneve, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. # ip link add name erspan1 type erspan external # tc qdisc add dev erspan1 ingress # tc filter add dev erspan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ erspan_opts 1:12:0:0/1:ffff:0:0 \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - improve some err msgs of extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 929825d710e2..449a63971451 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -575,6 +575,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -601,6 +605,18 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 7599a896f2e46e9c072e02a8299a67d4d2f96675 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 21 Nov 2019 16:58:53 +0100 Subject: audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL The 0-DAY found that audit_log_task is not declared under CONFIG_AUDITSYSCALL which causes compilation error when it is not defined: kernel/bpf/syscall.o: In function `bpf_audit_prog.isra.30': >> syscall.c:(.text+0x860): undefined reference to `audit_log_task' Adding the audit_log_task declaration and stub within CONFIG_AUDITSYSCALL ifdef. Fixes: 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") Reported-by: kbuild test robot Signed-off-by: Jiri Olsa Signed-off-by: David S. Miller --- include/linux/audit.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index edd006f4597d..18925d924c73 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -159,7 +159,6 @@ extern void audit_log_key(struct audit_buffer *ab, extern void audit_log_link_denied(const char *operation); extern void audit_log_lost(const char *message); -extern void audit_log_task(struct audit_buffer *ab); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -220,8 +219,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_link_denied(const char *string) { } -static inline void audit_log_task(struct audit_buffer *ab) -{ } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -361,6 +358,8 @@ static inline void audit_ptrace(struct task_struct *t) __audit_ptrace(t); } +extern void audit_log_task(struct audit_buffer *ab); + /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); @@ -648,6 +647,9 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_ptrace(struct task_struct *t) { } + +static inline void audit_log_task(struct audit_buffer *ab) +{ } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ -- cgit v1.2.3 From f145922ddcaa1cb9688b3d053622c98d9f9a7fff Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:14 +0800 Subject: net: mscc: ocelot: export ocelot_hwstamp_get/set functions Export ocelot_hwstamp_get/set functions so that DSA driver is able to reuse them. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index a836afe8f68e..2bac4bc34cf6 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -533,6 +533,8 @@ int ocelot_fdb_del(struct ocelot *ocelot, int port, int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged); int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); +int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); +int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); -- cgit v1.2.3 From e23a7b3e8daa4be3d91544d8ba210f96d2266de9 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:15 +0800 Subject: net: mscc: ocelot: convert to use ocelot_get_txtstamp() The method getting TX timestamp by reading timestamp FIFO and matching skbs list is common for DSA Felix driver too. So move code out of ocelot_board.c, convert to use ocelot_get_txtstamp() function and export it. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2bac4bc34cf6..1a5cb1b2ac5d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -406,6 +406,13 @@ struct ocelot_ops { int (*reset)(struct ocelot *ocelot); }; +struct ocelot_skb { + struct list_head head; + struct sk_buff *skb; + u8 id; +}; + + struct ocelot_port { struct ocelot *ocelot; @@ -536,6 +543,6 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); -void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts); +void ocelot_get_txtstamp(struct ocelot *ocelot); #endif -- cgit v1.2.3 From 400928bf928be153cddd76d9ac4e39978cb43fd3 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 20 Nov 2019 16:23:16 +0800 Subject: net: mscc: ocelot: convert to use ocelot_port_add_txtstamp_skb() Convert to use ocelot_port_add_txtstamp_skb() for adding skbs which require TX timestamp into list. Export it so that DSA Felix driver could reuse it too. Signed-off-by: Yangbo Lu Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1a5cb1b2ac5d..e1108a5f4f17 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -543,6 +543,8 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts); +int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, + struct sk_buff *skb); void ocelot_get_txtstamp(struct ocelot *ocelot); #endif -- cgit v1.2.3 From 1f8ac5703037fdd2e6c960cd35c2b14d18ef3933 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:33 +0100 Subject: ipv6: add fib6_has_custom_rules() helper It wraps the namespace field with the same name, to easily access it regardless of build options. Suggested-by: David Ahern Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5d1615463138..8ac3a59e5126 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -502,6 +502,11 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) } #ifdef CONFIG_IPV6_MULTIPLE_TABLES +static inline bool fib6_has_custom_rules(const struct net *net) +{ + return net->ipv6.fib6_has_custom_rules; +} + int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); @@ -527,6 +532,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net, return true; } #else +static inline bool fib6_has_custom_rules(const struct net *net) +{ + return false; +} static inline int fib6_rules_init(void) { return 0; -- cgit v1.2.3 From b9b33e7c24af1cddc7697056f1664279a40d9a4a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:34 +0100 Subject: ipv6: keep track of routes using src Use a per namespace counter, increment it on successful creation of any route using the source address, decrement it on deletion of such routes. This allows us to check easily if the routing decision in the current namespace depends on the packet source. Will be used by the next patch. Suggested-by: David Ahern Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 30 ++++++++++++++++++++++++++++++ include/net/netns/ipv6.h | 3 +++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 8ac3a59e5126..f1535f172935 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -90,7 +90,32 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL + +static inline bool fib6_routes_require_src(const struct net *net) +{ + return false; +} + +static inline void fib6_routes_require_src_inc(struct net *net) {} +static inline void fib6_routes_require_src_dec(struct net *net) {} + #else + +static inline bool fib6_routes_require_src(const struct net *net) +{ + return net->ipv6.fib6_routes_require_src > 0; +} + +static inline void fib6_routes_require_src_inc(struct net *net) +{ + net->ipv6.fib6_routes_require_src++; +} + +static inline void fib6_routes_require_src_dec(struct net *net) +{ + net->ipv6.fib6_routes_require_src--; +} + #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif @@ -212,6 +237,11 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) return ((struct rt6_info *)dst)->rt6i_idev; } +static inline bool fib6_requires_src(const struct fib6_info *rt) +{ + return rt->fib6_src.plen > 0; +} + static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 022a0fd1a5a4..5ec054473d81 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -83,6 +83,9 @@ struct netns_ipv6 { #ifdef CONFIG_IPV6_MULTIPLE_TABLES unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; +#ifdef CONFIG_IPV6_SUBTREES + unsigned int fib6_routes_require_src; +#endif struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; -- cgit v1.2.3 From c43c3d76c021d8d654ff5cfaad381f14f6beaf1a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:36 +0100 Subject: ipv4: move fib4_has_custom_rules() helper to public header So that we can use it in the next patch. Additionally constify the helper argument. Suggested-by: David Ahern Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 52b2406a5dfc..b9cba41c6d4f 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -311,6 +311,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, return err; } +static inline bool fib4_has_custom_rules(const struct net *net) +{ + return false; +} + static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; @@ -378,6 +383,11 @@ out: return err; } +static inline bool fib4_has_custom_rules(const struct net *net) +{ + return net->ipv4.fib_has_custom_rules; +} + bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 02b24941619fcce3d280311ac73b1e461552e9c8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Nov 2019 13:47:37 +0100 Subject: ipv4: use dst hint for ipv4 list receive This is alike the previous change, with some additional ipv4 specific quirk. Even when using the route hint we still have to do perform additional per packet checks about source address validity: a new helper is added to wrap them. Hints are explicitly disabled if the destination is a local broadcast, that keeps the code simple and local broadcast are a slower path anyway. UDP flood performances vs recvmmsg() receiver: vanilla patched delta Kpps Kpps % 1683 1871 +11 In the worst case scenario - each packet has a different destination address - the performance delta is within noise range. v3 -> v4: - re-enable hints for forward v2 -> v3: - really fix build (sic) and hint usage check - use fib4_has_custom_rules() helpers (David A.) - add ip_extract_route_hint() helper (Edward C.) - use prev skb as hint instead of copying data (Willem) v1 -> v2: - fix build issue with !CONFIG_IP_MULTIPLE_TABLES Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 6c516840380d..a9c60fc68e36 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -185,6 +185,10 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, struct fib_result *res); +int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin, + const struct sk_buff *hint); + static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin) { -- cgit v1.2.3 From db3e1c40cf2f973fbdd52ae0b59a9472b1c04f4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 18 Nov 2019 22:06:08 -0800 Subject: mac80211: Import airtime calculation code from mt76 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Felix recently added code to calculate airtime of packets to the mt76 driver. Import this into mac80211 so we can use it for airtime queue limit calculations. The airtime.c file is copied verbatim from the mt76 driver, and adjusted to be usable in mac80211. This involves: - Switching to mac80211 data structures. - Adding support for 160 MHz channels and HE mode. - Moving the symbol and duration calculations around a bit to avoid rounding with the higher rates and longer symbol times used for HE rates. The per-rate TX rate calculation is also split out to its own function so it can be used directly for the AQL calculations later. Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-3-kyan@google.com [fix HE_GROUP_IDX() to use 3 * bw, since there are 3 _gi values] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c643a19dce96..6fc26a051ba0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6424,4 +6424,33 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, struct cfg80211_nan_match_params *match, gfp_t gfp); +/** + * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. + * + * This function calculates the estimated airtime usage of a frame based on the + * rate information in the RX status struct and the frame length. + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @status: &struct ieee80211_rx_status containing the transmission rate + * information. + * @len: frame length in bytes + */ +u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + int len); + +/** + * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX. + * + * This function calculates the estimated airtime usage of a frame based on the + * rate information in the TX info struct and the frame length. + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @info: &struct ieee80211_tx_info of the frame. + * @len: frame length in bytes + */ +u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, + struct ieee80211_tx_info *info, + int len); + #endif /* MAC80211_H */ -- cgit v1.2.3 From 3ace10f5b5ad94bdbd4b419dc9da2217d57720a9 Mon Sep 17 00:00:00 2001 From: Kan Yan Date: Mon, 18 Nov 2019 22:06:09 -0800 Subject: mac80211: Implement Airtime-based Queue Limit (AQL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for the Fq_CoDel algorithm integrated in mac80211 layer to operate effectively to control excessive queueing latency, the CoDel algorithm requires an accurate measure of how long packets stays in the queue, AKA sojourn time. The sojourn time measured at the mac80211 layer doesn't include queueing latency in the lower layer (firmware/hardware) and CoDel expects lower layer to have a short queue. However, most 802.11ac chipsets offload tasks such TX aggregation to firmware or hardware, thus have a deep lower layer queue. Without a mechanism to control the lower layer queue size, packets only stay in mac80211 layer transiently before being sent to firmware queue. As a result, the sojourn time measured by CoDel in the mac80211 layer is almost always lower than the CoDel latency target, hence CoDel does little to control the latency, even when the lower layer queue causes excessive latency. The Byte Queue Limits (BQL) mechanism is commonly used to address the similar issue with wired network interface. However, this method cannot be applied directly to the wireless network interface. "Bytes" is not a suitable measure of queue depth in the wireless network, as the data rate can vary dramatically from station to station in the same network, from a few Mbps to over Gbps. This patch implements an Airtime-based Queue Limit (AQL) to make CoDel work effectively with wireless drivers that utilized firmware/hardware offloading. AQL allows each txq to release just enough packets to the lower layer to form 1-2 large aggregations to keep hardware fully utilized and retains the rest of the frames in mac80211 layer to be controlled by the CoDel algorithm. Signed-off-by: Kan Yan [ Toke: Keep API to set pending airtime internal, fix nits in commit msg ] Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-4-kyan@google.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ include/net/mac80211.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5ded77fad7fb..059524b87c4c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2606,6 +2606,13 @@ enum wiphy_params_flags { #define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256 +/* The per TXQ device queue limit in airtime */ +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L 5000 +#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H 12000 + +/* The per interface airtime threshold to switch to lower queue limit */ +#define IEEE80211_AQL_THRESHOLD 24000 + /** * struct cfg80211_pmksa - PMK Security Association * diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6fc26a051ba0..ba3f33cc41ea 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5565,6 +5565,18 @@ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime); +/** + * ieee80211_txq_airtime_check - check if a txq can send frame to device + * + * @hw: pointer obtained from ieee80211_alloc_hw() + * @txq: pointer obtained from station or virtual interface + * + * Return true if the AQL's airtime limit has not been reached and the txq can + * continue to send more packets to the device. Otherwise return false. + */ +bool +ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); + /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() -- cgit v1.2.3 From 7a89233ac50468a3a9636803a85d06c8f907f8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 18 Nov 2019 22:06:10 -0800 Subject: mac80211: Use Airtime-based Queue Limits (AQL) on packet dequeue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit added the ability to throttle stations when they queue too much airtime in the hardware. This commit enables the functionality by calculating the expected airtime usage of each packet that is dequeued from the TXQs in mac80211, and accounting that as pending airtime. The estimated airtime for each skb is stored in the tx_info, so we can subtract the same amount from the running total when the skb is freed or recycled. The throttling mechanism relies on this accounting to be accurate (i.e., that we are not freeing skbs without subtracting any airtime they were accounted for), so we put the subtraction into ieee80211_report_used_skb(). As an optimisation, we also subtract the airtime on regular TX completion, zeroing out the value stored in the packet afterwards, to avoid having to do an expensive lookup of the station from the packet data on every packet. This patch does *not* include any mechanism to wake a throttled TXQ again, on the assumption that this will happen anyway as a side effect of whatever freed the skb (most commonly a TX completion). Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191119060610.76681-5-kyan@google.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ba3f33cc41ea..aa145808e57a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1060,6 +1060,22 @@ struct ieee80211_tx_info { }; }; +static inline u16 +ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est) +{ + /* We only have 10 bits in tx_time_est, so store airtime + * in increments of 4us and clamp the maximum to 2**12-1 + */ + info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2; + return info->tx_time_est << 2; +} + +static inline u16 +ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) +{ + return info->tx_time_est << 2; +} + /** * struct ieee80211_tx_status - extended tx status info for rate control * -- cgit v1.2.3 From 677bf08cfdf9ee411c2084157f15d85edb09a81a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Nov 2019 06:56:23 +0100 Subject: udp: drop skb extensions before marking skb stateless Once udp stack has set the UDP_SKB_IS_STATELESS flag, later skb free assumes all skb head state has been dropped already. This will leak the extension memory in case the skb has extensions other than the ipsec secpath, e.g. bridge nf data. To fix this, set the UDP_SKB_IS_STATELESS flag only if we don't have extensions or if the extension space can be free'd. Fixes: 895b5c9f206eb7d25dc1360a ("netfilter: drop bridge nf reset from nf_reset") Cc: Paolo Abeni Reported-by: Byron Stanoszek Signed-off-by: Florian Westphal Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 64a395c7f689..8688f7adfda7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4169,12 +4169,18 @@ static inline void skb_ext_reset(struct sk_buff *skb) skb->active_extensions = 0; } } + +static inline bool skb_has_extensions(struct sk_buff *skb) +{ + return unlikely(skb->active_extensions); +} #else static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} +static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) -- cgit v1.2.3 From d1746d1e80a86ca86b0c2680510898d411d2ef47 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 22 Nov 2019 15:47:21 +0000 Subject: net: flow_dissector: Wrap unionized VLAN fields in a struct In commit a82055af5959 ("netfilter: nft_payload: add VLAN offload support"), VLAN fields in struct flow_dissector_key_vlan were unionized with the intention of introducing another field that covered the whole TCI header. However without a wrapping struct the subfields end up sharing the same bits. As a result, "tc filter add ... flower vlan_id 14" specifies not only vlan_id, but also vlan_priority. Fix by wrapping the individual VLAN fields in a struct. Fixes: a82055af5959 ("netfilter: nft_payload: add VLAN offload support") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index f06b0239c32b..b8c20e9f343e 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -49,9 +49,11 @@ struct flow_dissector_key_tags { struct flow_dissector_key_vlan { union { - u16 vlan_id:12, - vlan_dei:1, - vlan_priority:3; + struct { + u16 vlan_id:12, + vlan_dei:1, + vlan_priority:3; + }; __be16 vlan_tci; }; __be16 vlan_tpid; -- cgit v1.2.3 From a18fab48dbacbb7ff104a13e987778b7995bec07 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 28 Oct 2019 16:58:53 +0200 Subject: net/mlx5: DR, Add HW bits and definitions for Geneve flex parser Add definition for flex parser tunneling header for Geneve. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4f912d4e67bc..5d54fccf87fc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1110,6 +1110,7 @@ enum { }; enum { + MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, -- cgit v1.2.3 From 30429fba99b51836ea8a11174be95ddaa8c47703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Fri, 22 Nov 2019 13:50:52 -0800 Subject: net: inet_is_local_reserved_port() should return bool not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Eric Dumazet Signed-off-by: Maciej Żenczykowski Signed-off-by: Jakub Kicinski --- include/net/ip.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index a2c61c36dc4a..cebf3e10def1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -339,10 +339,10 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o void inet_get_local_port_range(struct net *net, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline int inet_is_local_reserved_port(struct net *net, int port) +static inline bool inet_is_local_reserved_port(struct net *net, int port) { if (!net->ipv4.sysctl_local_reserved_ports) - return 0; + return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } @@ -357,9 +357,9 @@ static inline int inet_prot_sock(struct net *net) } #else -static inline int inet_is_local_reserved_port(struct net *net, int port) +static inline bool inet_is_local_reserved_port(struct net *net, int port) { - return 0; + return false; } static inline int inet_prot_sock(struct net *net) -- cgit v1.2.3 From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. Signed-off-by: Jakub Kicinski --- include/linux/audit.h | 5 ----- include/uapi/linux/audit.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 18925d924c73..aee3dc9eb378 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -358,8 +358,6 @@ static inline void audit_ptrace(struct task_struct *t) __audit_ptrace(t); } -extern void audit_log_task(struct audit_buffer *ab); - /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); @@ -647,9 +645,6 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad) static inline void audit_ptrace(struct task_struct *t) { } - -static inline void audit_log_task(struct audit_buffer *ab) -{ } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 32a5db900f47..c89c6495983d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,7 +116,6 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ -#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From e3cf8b3668a808c1d252269ffc34a5723cfb9a7b Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 22 Nov 2019 12:37:08 +0000 Subject: net: phy: remove phy_ethtool_sset() There are no users of phy_ethtool_sset() in the kernel anymore, and as of commit 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode"), the implementation is slightly buggy - it doesn't correctly check the masked advertising mask as it used to. Remove it, and update the phy documentation to refer to its replacement function. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 124516fe2763..f5cdfb206097 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1160,7 +1160,6 @@ void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); -int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, -- cgit v1.2.3 From d46b7e4fb06037a61415f5b6964fcf632ee1dc34 Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 21 Nov 2019 00:36:22 +0000 Subject: net: phylink: rename mac_link_state() op to mac_pcs_get_state() Rename the mac_link_state() method to mac_pcs_get_state() to make it clear that it should be returning the MACs PCS current state, which is used for inband negotiation rather than just reading back what the MAC has been configured for. Update the documentation to explicitly mention that this is for inband. We drop the return value as well; most of phylink doesn't check the return value and it is not clear what it should do on error - instead arrange for state->link to be false. Signed-off-by: Russell King Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 300ecdb6790a..fed5488e3c75 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -72,7 +72,7 @@ struct phylink_config { /** * struct phylink_mac_ops - MAC operations structure. * @validate: Validate and update the link configuration. - * @mac_link_state: Read the current link state from the hardware. + * @mac_pcs_get_state: Read the current link state from the hardware. * @mac_config: configure the MAC for the selected mode and state. * @mac_an_restart: restart 802.3z BaseX autonegotiation. * @mac_link_down: take the link down. @@ -84,8 +84,8 @@ struct phylink_mac_ops { void (*validate)(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); - int (*mac_link_state)(struct phylink_config *config, - struct phylink_link_state *state); + void (*mac_pcs_get_state)(struct phylink_config *config, + struct phylink_link_state *state); void (*mac_config)(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); void (*mac_an_restart)(struct phylink_config *config); @@ -127,18 +127,19 @@ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); /** - * mac_link_state() - Read the current link state from the hardware + * mac_pcs_get_state() - Read the current inband link state from the hardware * @config: a pointer to a &struct phylink_config. * @state: a pointer to a &struct phylink_link_state. * - * Read the current link state from the MAC, reporting the current - * speed in @state->speed, duplex mode in @state->duplex, pause mode - * in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, - * negotiation completion state in @state->an_complete, and link - * up state in @state->link. + * Read the current inband link state from the MAC PCS, reporting the + * current speed in @state->speed, duplex mode in @state->duplex, pause + * mode in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, + * negotiation completion state in @state->an_complete, and link up state + * in @state->link. If possible, @state->lp_advertising should also be + * populated. */ -int mac_link_state(struct phylink_config *config, - struct phylink_link_state *state); +void mac_pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state); /** * mac_config() - configure the MAC for the selected mode and state @@ -166,7 +167,7 @@ int mac_link_state(struct phylink_config *config, * 1000base-X or Cisco SGMII mode depending on the @state->interface * mode). In both cases, link state management (whether the link * is up or not) is performed by the MAC, and reported via the - * mac_link_state() callback. Changes in link state must be made + * mac_pcs_get_state() callback. Changes in link state must be made * by calling phylink_mac_change(). * * If in 802.3z mode, the link speed is fixed, dependent on the -- cgit v1.2.3 From 312434617cb16be5166316cf9d08ba760b1042a1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 23 Nov 2019 11:56:49 +0800 Subject: sctp: cache netns in sctp_ep_common This patch is to fix a data-race reported by syzbot: BUG: KCSAN: data-race in sctp_assoc_migrate / sctp_hash_obj write to 0xffff8880b67c0020 of 8 bytes by task 18908 on cpu 1: sctp_assoc_migrate+0x1a6/0x290 net/sctp/associola.c:1091 sctp_sock_migrate+0x8aa/0x9b0 net/sctp/socket.c:9465 sctp_accept+0x3c8/0x470 net/sctp/socket.c:4916 inet_accept+0x7f/0x360 net/ipv4/af_inet.c:734 __sys_accept4+0x224/0x430 net/socket.c:1754 __do_sys_accept net/socket.c:1795 [inline] __se_sys_accept net/socket.c:1792 [inline] __x64_sys_accept+0x4e/0x60 net/socket.c:1792 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffff8880b67c0020 of 8 bytes by task 12003 on cpu 0: sctp_hash_obj+0x4f/0x2d0 net/sctp/input.c:894 rht_key_get_hash include/linux/rhashtable.h:133 [inline] rht_key_hashfn include/linux/rhashtable.h:159 [inline] rht_head_hashfn include/linux/rhashtable.h:174 [inline] head_hashfn lib/rhashtable.c:41 [inline] rhashtable_rehash_one lib/rhashtable.c:245 [inline] rhashtable_rehash_chain lib/rhashtable.c:276 [inline] rhashtable_rehash_table lib/rhashtable.c:316 [inline] rht_deferred_worker+0x468/0xab0 lib/rhashtable.c:420 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 It was caused by rhashtable access asoc->base.sk when sctp_assoc_migrate is changing its value. However, what rhashtable wants is netns from asoc base.sk, and for an asoc, its netns won't change once set. So we can simply fix it by caching netns since created. Fixes: d6c0256a60e6 ("sctp: add the rhashtable apis for sctp global transport hashtable") Reported-by: syzbot+e3b35fe7918ff0ee474e@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 503fbc3cd819..2b6f3f13d5bc 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1239,6 +1239,9 @@ struct sctp_ep_common { /* What socket does this endpoint belong to? */ struct sock *sk; + /* Cache netns and it won't change once set */ + struct net *net; + /* This is where we receive inbound chunks. */ struct sctp_inq inqueue; -- cgit v1.2.3 From 4b3da77b72ad6b3c48c6fe4a395ace7db39a12c5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:54 +0100 Subject: bpf, x86: Generalize and extend bpf_arch_text_poke for direct jumps Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86 JIT in order to be able to patch direct jumps or nop them out. We need this facility in order to patch tail call jumps and in later work also BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/aa4784196a8e5e985af4b30a4fe5336bce6e9643.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e89e86122233..7978b617caa8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1284,10 +1284,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + /* All call-related pokes. */ BPF_MOD_NOP_TO_CALL, BPF_MOD_CALL_TO_CALL, BPF_MOD_CALL_TO_NOP, + /* All jump-related pokes. */ + BPF_MOD_NOP_TO_JUMP, + BPF_MOD_JUMP_TO_JUMP, + BPF_MOD_JUMP_TO_NOP, }; + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); -- cgit v1.2.3 From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:55 +0100 Subject: bpf: Move bpf_free_used_maps into sleepable section We later on are going to need a sleepable context as opposed to plain RCU callback in order to untrack programs we need to poke at runtime and tracking as well as image update is performed under mutex. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7978b617caa8..561b920f0bf7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1031,6 +1031,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, { return -ENOTSUPP; } + +static inline void bpf_map_put(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, -- cgit v1.2.3 From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:56 +0100 Subject: bpf: Move owner type, jited info into array auxiliary data We're going to extend this with further information which is only relevant for prog array at this point. Given this info is not used in critical path, move it into its own structure such that the main array map structure can be kept on diet. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 561b920f0bf7..c3b29061284e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -560,17 +560,21 @@ struct bpf_prog_aux { }; }; +struct bpf_array_aux { + /* 'Ownership' of prog array is claimed by the first program that + * is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have + * the same prog type and JITed flag. + */ + enum bpf_prog_type type; + bool jited; +}; + struct bpf_array { struct bpf_map map; u32 elem_size; u32 index_mask; - /* 'ownership' of prog_array is claimed by the first program that - * is going to use this map or by the first program which FD is stored - * in the map to make sure that all callers and callees have the same - * prog_type and JITed flag - */ - enum bpf_prog_type owner_prog_type; - bool owner_jited; + struct bpf_array_aux *aux; union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); -- cgit v1.2.3 From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:57 +0100 Subject: bpf: Add initial poke descriptor table for jit images Add initial poke table data structures and management to the BPF prog that can later be used by JITs. Also add an instance of poke specific data for tail call maps; plan for later work is to extend this also for BPF static keys. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 20 ++++++++++++++++++++ include/linux/filter.h | 10 ++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3b29061284e..312983bf7faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,6 +488,24 @@ struct bpf_func_info_aux { bool unreliable; }; +enum bpf_jit_poke_reason { + BPF_POKE_REASON_TAIL_CALL, +}; + +/* Descriptor of pokes pointing /into/ the JITed image. */ +struct bpf_jit_poke_descriptor { + void *ip; + union { + struct { + struct bpf_map *map; + u32 key; + } tail_call; + }; + bool ip_stable; + u8 adj_off; + u16 reason; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -513,6 +531,8 @@ struct bpf_prog_aux { const char *attach_func_name; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ + struct bpf_jit_poke_descriptor *poke_tab; + u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/filter.h b/include/linux/filter.h index ad80e9c6111c..796b60d8cc6c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke); + int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); @@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) return false; } +static inline int +bpf_jit_add_poke_descriptor(struct bpf_prog *prog, + struct bpf_jit_poke_descriptor *poke) +{ + return -ENOTSUPP; +} + static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); -- cgit v1.2.3 From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:58 +0100 Subject: bpf: Add poke dependency tracking for prog array maps This work adds program tracking to prog array maps. This is needed such that upon prog array updates/deletions we can fix up all programs which make use of this tail call map. We add ops->map_poke_{un,}track() helpers to maps to maintain the list of programs and ops->map_poke_run() for triggering the actual update. bpf_array_aux is extended to contain the list head and poke_mutex in order to serialize program patching during updates/deletions. bpf_free_used_maps() will untrack the program shortly before dropping the reference to the map. For clearing out the prog array once all urefs are dropped we need to use schedule_work() to have a sleepable context. The prog_array_map_poke_run() is triggered during updates/deletions and walks the maintained prog list. It checks in their poke_tabs whether the map and key is matching and runs the actual bpf_arch_text_poke() for patching in the nop or new jmp location. Depending on the type of update, we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net --- include/linux/bpf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 312983bf7faa..c2f07fd410c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,6 +22,7 @@ struct bpf_verifier_env; struct bpf_verifier_log; struct perf_event; struct bpf_prog; +struct bpf_prog_aux; struct bpf_map; struct sock; struct seq_file; @@ -64,6 +65,12 @@ struct bpf_map_ops { const struct btf_type *key_type, const struct btf_type *value_type); + /* Prog poke tracking helpers. */ + int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux); + void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old, + struct bpf_prog *new); + /* Direct value access helpers. */ int (*map_direct_value_addr)(const struct bpf_map *map, u64 *imm, u32 off); @@ -588,6 +595,11 @@ struct bpf_array_aux { */ enum bpf_prog_type type; bool jited; + /* Programs with direct jumps into programs part of this array. */ + struct list_head poke_progs; + struct bpf_map *map; + struct mutex poke_mutex; + struct work_struct work; }; struct bpf_array { -- cgit v1.2.3 From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Nov 2019 21:07:59 +0100 Subject: bpf: Constant map key tracking for prog array pokes Add tracking of constant keys into tail call maps. The signature of bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3 is a index key. The direct call approach for tail calls can be enabled if the verifier asserted that for all branches leading to the tail call helper invocation, the map pointer and index key were both constant and the same. Tracking of map pointers we already do from prior work via c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation") and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls on maps"). Given the tail call map index key is not on stack but directly in the register, we can add similar tracking approach and later in fixup_bpf_calls() add a poke descriptor to the progs poke_tab with the relevant information for the JITing phase. We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL instruction in order to point into the prog's poke_tab, and keep insn->imm as 0 as indicator that current indirect tail call emission must be used. Note that publishing to the tracker must happen at the end of fixup_bpf_calls() since adding elements to the poke_tab reallocates its memory, so we need to wait until its in final state. Future work can generalize and add similar approach to optimize plain array map lookups. Difference there is that we need to look into the key value that sits on stack. For clarity in bpf_insn_aux_data, map_state has been renamed into map_ptr_state, so we get map_{ptr,key}_state as trackers. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net --- include/linux/bpf_verifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cdd08bf0ec06..26e40de9ef55 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -293,7 +293,7 @@ struct bpf_verifier_state_list { struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ - unsigned long map_state; /* pointer/poison value for maps */ + unsigned long map_ptr_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { @@ -301,6 +301,7 @@ struct bpf_insn_aux_data { u32 map_off; /* offset from value base address */ }; }; + u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ -- cgit v1.2.3 From b8cd76ca4ae34731d47cd6a876d912a08efcc240 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Nov 2019 21:37:31 +0100 Subject: bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a definition of bpf_jit_blinding_enabled() when CONFIG_BPF_JIT is not set in order to fix a recent build regression: [...] CC kernel/bpf/verifier.o CC kernel/bpf/inode.o kernel/bpf/verifier.c: In function ‘fixup_bpf_calls’: kernel/bpf/verifier.c:9132:25: error: implicit declaration of function ‘bpf_jit_blinding_enabled’; did you mean ‘bpf_jit_kallsyms_enabled’? [-Werror=implicit-function-declaration] 9132 | bool expect_blinding = bpf_jit_blinding_enabled(prog); | ^~~~~~~~~~~~~~~~~~~~~~~~ | bpf_jit_kallsyms_enabled CC kernel/bpf/helpers.o CC kernel/bpf/hashtab.o [...] Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Reported-by: Jakub Sitnicki Reported-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/40baf8f3507cac4851a310578edfb98ce73b5605.1574541375.git.daniel@iogearbox.net --- include/linux/filter.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 796b60d8cc6c..1b1e8b8f88da 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1053,6 +1053,11 @@ static inline bool ebpf_jit_enabled(void) return false; } +static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) +{ + return false; +} + static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; -- cgit v1.2.3 From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 24 Nov 2019 01:39:42 +0100 Subject: bpf: Simplify __bpf_arch_text_poke poke type handling Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in old_addr as well as new_addr, it's a bit redundant and unnecessarily complicates __bpf_arch_text_poke() itself since we can derive the same from the *_addr that were passed in. Hence simplify and use BPF_MOD_{CALL,JUMP} as types which also allows to clean up call-sites. In addition to that, __bpf_arch_text_poke() currently verifies that text matches expected old_insn before we invoke text_poke_bp(). Also add a check on new_insn and skip rewrite if it already matches. Reason why this is rather useful is that it avoids making any special casing in prog_array_map_poke_run() when old and new prog were NULL and has the benefit that also for this case we perform a check on text whether it really matches our expectations. Suggested-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net --- include/linux/bpf.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2f07fd410c1..35903f148be5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1324,14 +1324,8 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { - /* All call-related pokes. */ - BPF_MOD_NOP_TO_CALL, - BPF_MOD_CALL_TO_CALL, - BPF_MOD_CALL_TO_NOP, - /* All jump-related pokes. */ - BPF_MOD_NOP_TO_JUMP, - BPF_MOD_JUMP_TO_JUMP, - BPF_MOD_JUMP_TO_NOP, + BPF_MOD_CALL, + BPF_MOD_JUMP, }; int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, -- cgit v1.2.3 From 40363cf13999ee4fb3b5c1e67fa5e6f0e9da34bd Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 14 Nov 2019 12:17:41 -0500 Subject: writeback: fix -Wformat compilation warnings The commit f05499a06fb4 ("writeback: use ino_t for inodes in tracepoints") introduced a lot of GCC compilation warnings on s390, In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/writeback.h:904, from fs/fs-writeback.c:82: ./include/trace/events/writeback.h: In function 'trace_raw_output_writeback_page_template': ./include/trace/events/writeback.h:76:12: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'ino_t' {aka 'unsigned int'} [-Wformat=] TP_printk("bdi %s: ino=%lu index=%lu", ^~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/trace/trace_events.h:360:22: note: in definition of macro 'DECLARE_EVENT_CLASS' trace_seq_printf(s, print); \ ^~~~~ ./include/trace/events/writeback.h:76:2: note: in expansion of macro 'TP_printk' TP_printk("bdi %s: ino=%lu index=%lu", ^~~~~~~~~ Fix them by adding necessary casts where ino_t could be either "unsigned int" or "unsigned long". Fixes: f05499a06fb4 ("writeback: use ino_t for inodes in tracepoints") Signed-off-by: Qian Cai Signed-off-by: Tejun Heo --- include/trace/events/writeback.h | 48 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b4f0ffe1817e..ef50be4e5e6c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->index ) ); @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) @@ -201,8 +201,8 @@ TRACE_EVENT(inode_foreign_history, TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, - __entry->ino, - __entry->cgroup_ino, + (unsigned long)__entry->ino, + (unsigned long)__entry->cgroup_ino, __entry->history ) ); @@ -230,9 +230,9 @@ TRACE_EVENT(inode_switch_wbs, TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, - __entry->ino, - __entry->old_cgroup_ino, - __entry->new_cgroup_ino + (unsigned long)__entry->ino, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino ) ); @@ -266,10 +266,10 @@ TRACE_EVENT(track_foreign_dirty, TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, - __entry->ino, + (unsigned long)__entry->ino, __entry->memcg_id, - __entry->cgroup_ino, - __entry->page_cgroup_ino + (unsigned long)__entry->cgroup_ino, + (unsigned long)__entry->page_cgroup_ino ) ); @@ -296,7 +296,7 @@ TRACE_EVENT(flush_foreign, TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, - __entry->cgroup_ino, + (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) @@ -326,9 +326,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->sync_mode, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -383,7 +383,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -421,7 +421,7 @@ DECLARE_EVENT_CLASS(writeback_class, ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ @@ -489,7 +489,7 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic, __entry->range_start, __entry->range_end, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ) @@ -528,7 +528,7 @@ TRACE_EVENT(writeback_queue_io, __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -622,7 +622,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -707,7 +707,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -735,11 +735,11 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -813,14 +813,14 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -861,7 +861,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->dirtied_when, + (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); -- cgit v1.2.3 From 0be0ee71816b2b6725e2b4f32ad6726c9d729777 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 11 Nov 2019 15:51:03 -0800 Subject: vfs: properly and reliably lock f_pos in fdget_pos() fdget_pos() is used by file operations that will read and update f_pos: things like "read()", "write()" and "lseek()" (but not, for example, "pread()/pwrite" that get their file positions elsewhere). However, it had two separate escape clauses for this, because not everybody wants or needs serialization of the file position. The first and most obvious case is the "file descriptor doesn't have a position at all", ie a stream-like file. Except we didn't actually use FMODE_STREAM, but instead used FMODE_ATOMIC_POS. The reason for that was that FMODE_STREAM didn't exist back in the days, but also that we didn't want to mark all the special cases, so we only marked the ones that _required_ position atomicity according to POSIX - regular files and directories. The case one was intentionally lazy, but now that we _do_ have FMODE_STREAM we could and should just use it. With the change to use FMODE_STREAM, there are no remaining uses for FMODE_ATOMIC_POS, and all the code to set it is deleted. Any cases where we don't want the serialization because the driver (or subsystem) doesn't use the file position should just be updated to do "stream_open()". We've done that for all the obvious and common situations, we may need a few more. Quoting Kirill Smelkov in the original FMODE_STREAM thread (see link below for full email): "And I appreciate if people could help at least somehow with "getting rid of mixed case entirely" (i.e. always lock f_pos_lock on !FMODE_STREAM), because this transition starts to diverge from my particular use-case too far. To me it makes sense to do that transition as follows: - convert nonseekable_open -> stream_open via stream_open.cocci; - audit other nonseekable_open calls and convert left users that truly don't depend on position to stream_open; - extend stream_open.cocci to analyze alloc_file_pseudo as well (this will cover pipes and sockets), or maybe convert pipes and sockets to FMODE_STREAM manually; - extend stream_open.cocci to analyze file_operations that use no_llseek or noop_llseek, but do not use nonseekable_open or alloc_file_pseudo. This might find files that have stream semantic but are opened differently; - extend stream_open.cocci to analyze file_operations whose .read/.write do not use ppos at all (independently of how file was opened); - ... - after that remove FMODE_ATOMIC_POS and always take f_pos_lock if !FMODE_STREAM; - gather bug reports for deadlocked read/write and convert missed cases to FMODE_STREAM, probably extending stream_open.cocci along the road to catch similar cases i.e. always take f_pos_lock unless a file is explicitly marked as being stream, and try to find and cover all files that are streams" We have not done the "extend stream_open.cocci to analyze alloc_file_pseudo" as well, but the previous commit did manually handle the case of pipes and sockets. The other case where we can avoid locking f_pos is the "this file descriptor only has a single user and it is us, and thus there is no need to lock it". The second test was correct, although a bit subtle and worth just re-iterating here. There are two kinds of other sources of references to the same file descriptor: file descriptors that have been explicitly shared across fork() or with dup(), and file tables having elevated reference counts due to threading (or explicit file sharing with clone()). The first case would have incremented the file count explicitly, and in the second case the previous __fdget() would have incremented it for us and set the FDPUT_FPUT flag. But in both cases the file count would be greater than one, so the "file_count(file) > 1" test catches both situations. Also note that if file_count is 1, that also means that no other thread can have access to the file table, so there also cannot be races with concurrent calls to dup()/fork()/clone() that would increment the file count any other way. Link: https://lore.kernel.org/linux-fsdevel/20190413184404.GA13490@deco.navytux.spb.ru Cc: Kirill Smelkov Cc: Eic Dumazet Cc: Al Viro Cc: Alan Stern Cc: Marco Elver Cc: Andrea Parri Cc: Paul McKenney Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0d909d35763..a7c3f6dd5701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -148,8 +148,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) -/* File needs atomic accesses to f_pos */ -#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)0x10000) /* Has read method(s) */ -- cgit v1.2.3 From bec170e55982c2d3b8e1beccadf16e288fe6fb5a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 23 Nov 2019 17:28:37 +0100 Subject: net: phy: add helpers phy_(un)lock_mdio_bus Add helpers to make locking/unlocking the MDIO bus easier. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/phy.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index f5cdfb206097..5032d453ac66 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,16 @@ static inline const char *phydev_name(const struct phy_device *phydev) return dev_name(&phydev->mdio.dev); } +static inline void phy_lock_mdio_bus(struct phy_device *phydev) +{ + mutex_lock(&phydev->mdio.bus->mdio_lock); +} + +static inline void phy_unlock_mdio_bus(struct phy_device *phydev) +{ + mutex_unlock(&phydev->mdio.bus->mdio_lock); +} + void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); void phy_attached_info(struct phy_device *phydev); -- cgit v1.2.3