From 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2021 17:00:29 -0700 Subject: net/af_unix: fix a data-race in unix_dgram_poll syzbot reported another data-race in af_unix [1]. Let's change __skb_insert() to use WRITE_ONCE() when changing skb head qlen. Also, change unix_dgram_poll() to use the lockless version of unix_recvq_full(). It is very possible we can switch all/most unix_recvq_full() to the lockless version; this will be done in a future kernel version. [1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1 BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0: __skb_insert include/linux/skbuff.h:1938 [inline] __skb_queue_before include/linux/skbuff.h:2043 [inline] __skb_queue_tail include/linux/skbuff.h:2076 [inline] skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532 __do_sys_sendmmsg net/socket.c:2561 [inline] __se_sys_sendmmsg net/socket.c:2558 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1: skb_queue_len include/linux/skbuff.h:1869 [inline] unix_recvq_full net/unix/af_unix.c:194 [inline] unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777 sock_poll+0x23e/0x260 net/socket.c:1288 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll fs/eventpoll.c:846 [inline] ep_send_events fs/eventpoll.c:1683 [inline] ep_poll fs/eventpoll.c:1798 [inline] do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline] __se_sys_epoll_wait fs/eventpoll.c:2233 [inline] __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000001b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G W 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()") Cc: Qian Cai Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bdb0db3e825..841e2f0f5240 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1940,7 +1940,7 @@ static inline void __skb_insert(struct sk_buff *newsk, WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(next->prev, newsk); WRITE_ONCE(prev->next, newsk); - list->qlen++; + WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, -- cgit v1.2.3 From 937f5d5ec642501d2dd3c91918685de30a932b34 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 9 Sep 2021 18:01:28 -0700 Subject: Input: ads7846 - remove custom filter handling functions from pdata The functions in the platform data struct to initialize, cleanup and apply custom filters are not in use by any mainline board.
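To make the af_unix/skbuff data-race fix above concrete, here is a minimal sketch of the WRITE_ONCE()/READ_ONCE() pairing it completes (the helper names are illustrative, not the kernel's; the in-tree lockless reader is skb_queue_len_lockless()):

	/* writer side, runs under the queue spinlock (skb_queue_tail() path) */
	static inline void queue_len_inc(struct sk_buff_head *list)
	{
		WRITE_ONCE(list->qlen, list->qlen + 1);
	}

	/* lockless reader side, e.g. the unix_dgram_poll() path */
	static inline __u32 queue_len_peek(const struct sk_buff_head *list)
	{
		return READ_ONCE(list->qlen);
	}

The marked accesses keep the compiler from tearing or fusing the plain load/store and tell KCSAN that the remaining race on qlen is intentional.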
Remove support for them to pave the road for more cleanups to come. The enum was moved as it has no users outside of the driver code itself. Signed-off-by: Daniel Mack Reviewed-by: Marco Felsch Link: https://lore.kernel.org/r/20210907200726.2034962-3-daniel@zonque.org Signed-off-by: Dmitry Torokhov --- include/linux/spi/ads7846.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h index 1a5eaef3b7f2..d424c1aadf38 100644 --- a/include/linux/spi/ads7846.h +++ b/include/linux/spi/ads7846.h @@ -1,17 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* linux/spi/ads7846.h */ -/* Touchscreen characteristics vary between boards and models. The - * platform_data for the device's "struct device" holds this information. - * - * It's OK if the min/max values are zero. - */ -enum ads7846_filter { - ADS7846_FILTER_OK, - ADS7846_FILTER_REPEAT, - ADS7846_FILTER_IGNORE, -}; - struct ads7846_platform_data { u16 model; /* 7843, 7845, 7846, 7873. */ u16 vref_delay_usecs; /* 0 for external vref; etc */ @@ -51,10 +40,6 @@ struct ads7846_platform_data { int gpio_pendown_debounce; /* platform specific debounce time for * the gpio_pendown */ int (*get_pendown_state)(void); - int (*filter_init) (const struct ads7846_platform_data *pdata, - void **filter_data); - int (*filter) (void *filter_data, int data_idx, int *val); - void (*filter_cleanup)(void *filter_data); void (*wait_for_sync)(void); bool wakeup; unsigned long irq_flags; -- cgit v1.2.3 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 
0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build the `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary in the tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()"), which added mmap_assert_locked() to the find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for the bpf_get_stack() or bpf_get_stackid() helpers (kernel/bpf/stackmap.c), which use mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in the bpf_get_stack[id]() use case, the above warning is emitted during the test run. This patch fixes the issue by (1) using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy the lockdep checking in find_vma(), and (2) dropping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R. 
Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- include/linux/mmap_lock.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..3af8f7fb067d 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm) __mmap_lock_trace_released(mm, false); } -static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) -{ - if (mmap_read_trylock(mm)) { - rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); - return true; - } - return false; -} - static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); -- cgit v1.2.3 From cb19c107979b8000825060cb5b84dfe9212f4f86 Mon Sep 17 00:00:00 2001 From: Yongqiang Niu Date: Mon, 2 Aug 2021 16:59:32 +0800 Subject: soc: mediatek: mmsys: add comp OVL_2L2/POSTMASK/RDMA4 This patch adds some more DDP components: OVL_2L2 is an OVL which includes a 2-layer overlay, POSTMASK controls the rounded corners of the display frame, and RDMA4 reads the DMA buffer. Signed-off-by: Yongqiang Niu Reviewed-by: Chun-Kuang Hu Reviewed-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/1627894773-23872-2-git-send-email-yongqiang.niu@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-mmsys.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 2228bf6133da..4bba275e235a 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -29,13 +29,16 @@ enum mtk_ddp_comp_id { DDP_COMPONENT_OVL0, DDP_COMPONENT_OVL_2L0, DDP_COMPONENT_OVL_2L1, + DDP_COMPONENT_OVL_2L2, DDP_COMPONENT_OVL1, + DDP_COMPONENT_POSTMASK0, DDP_COMPONENT_PWM0, DDP_COMPONENT_PWM1, DDP_COMPONENT_PWM2, DDP_COMPONENT_RDMA0, DDP_COMPONENT_RDMA1, DDP_COMPONENT_RDMA2, + DDP_COMPONENT_RDMA4, DDP_COMPONENT_UFOE, DDP_COMPONENT_WDMA0, DDP_COMPONENT_WDMA1, -- cgit v1.2.3 From f55e36d5ab76c3097ff36ecea60b91c6b0d80fc8 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Mon, 13 Sep 2021 10:50:24 +0300 Subject: qed: Improve the stack space of filter_config() As it was reported and discussed in: https://lore.kernel.org/lkml/CAHk-=whF9F89vsfH8E9TGc0tZA-yhzi2Di8wOtquNB5vRkFX5w@mail.gmail.com/ This patch improves the stack space of qede_config_rx_mode() by splitting filter_config() into 3 functions and removing the union qed_filter_type_params. Reported-by: Naresh Kamboju Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. 
Miller --- include/linux/qed/qed_eth_if.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 812a4d751163..4df0bf0a0864 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -145,12 +145,6 @@ struct qed_filter_mcast_params { unsigned char mac[64][ETH_ALEN]; }; -union qed_filter_type_params { - enum qed_filter_rx_mode_type accept_flags; - struct qed_filter_ucast_params ucast; - struct qed_filter_mcast_params mcast; -}; - enum qed_filter_type { QED_FILTER_TYPE_UCAST, QED_FILTER_TYPE_MCAST, @@ -158,11 +152,6 @@ enum qed_filter_type { QED_MAX_FILTER_TYPES, }; -struct qed_filter_params { - enum qed_filter_type type; - union qed_filter_type_params filter; -}; - struct qed_tunn_params { u16 vxlan_port; u8 update_vxlan_port; @@ -314,8 +303,14 @@ struct qed_eth_ops { int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle); - int (*filter_config)(struct qed_dev *cdev, - struct qed_filter_params *params); + int (*filter_config_rx_mode)(struct qed_dev *cdev, + enum qed_filter_rx_mode_type type); + + int (*filter_config_ucast)(struct qed_dev *cdev, + struct qed_filter_ucast_params *params); + + int (*filter_config_mcast)(struct qed_dev *cdev, + struct qed_filter_mcast_params *params); int (*fastpath_stop)(struct qed_dev *cdev); -- cgit v1.2.3 From 537d3af1bee8ad1415fda9b622d1ea6d1ae76dfa Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Mon, 12 Jul 2021 14:39:12 +0200 Subject: rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined According to the description of the rpmsg_create_ept in rpmsg_core.c the function should return NULL on error. Fixes: 2c8a57088045 ("rpmsg: Provide function stubs for API") Signed-off-by: Arnaud Pouliquen Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210712123912.10672-1-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- include/linux/rpmsg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index d97dcd049f18..a8dcf8a9ae88 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -231,7 +231,7 @@ static inline struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev /* This shouldn't be possible */ WARN_ON(1); - return ERR_PTR(-ENXIO); + return NULL; } static inline int rpmsg_send(struct rpmsg_endpoint *ept, void *data, int len) -- cgit v1.2.3 From 4eb6bd55cfb22ffc20652732340c4962f3ac9a91 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 10 Sep 2021 16:40:39 -0700 Subject: compiler.h: drop fallback overflow checkers Once upgrading the minimum supported version of GCC to 5.1, we can drop the fallback code for !COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW. 
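For context, a small usage sketch of the builtin-backed checkers that remain after this cleanup (variable names are made up; assume len is a caller-supplied size_t):

	size_t hdr = 64, total;

	if (check_add_overflow(hdr, len, &total))
		return -EOVERFLOW;	/* true means the addition wrapped */
	/* total now holds hdr + len and is safe to pass to an allocator */

With the fallback gone, check_add_overflow() and friends always expand to the type-generic __builtin_*_overflow() builtins, wrapped in __must_check_overflow() so the result cannot be silently ignored.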
This is effectively a revert of commit f0907827a8a9 ("compiler.h: enable builtin overflow checkers and add fallback code") Link: https://github.com/ClangBuiltLinux/linux/issues/1438#issuecomment-916745801 Suggested-by: Rasmus Villemoes Signed-off-by: Nick Desaulniers Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 13 ---- include/linux/compiler-gcc.h | 4 -- include/linux/overflow.h | 138 +---------------------------------------- 3 files changed, 3 insertions(+), 152 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 49b0ac8b6fd3..3c4de9b6c6e3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -62,19 +62,6 @@ #define __no_sanitize_coverage #endif -/* - * Not all versions of clang implement the type-generic versions - * of the builtin overflow checkers. Fortunately, clang implements - * __has_builtin allowing us to avoid awkward version - * checks. Unfortunately, we don't know which version of gcc clang - * pretends to be, so the macro may or may not be defined. - */ -#if __has_builtin(__builtin_mul_overflow) && \ - __has_builtin(__builtin_add_overflow) && \ - __has_builtin(__builtin_sub_overflow) -#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 -#endif - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cb9217fc60af..3f7f6fa0e051 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -128,10 +128,6 @@ #define __no_sanitize_coverage #endif -#if GCC_VERSION >= 50100 -#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 0f12345c21fb..4669632bd72b 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -6,12 +6,9 @@ #include /* - * In the fallback code below, we need to compute the minimum and - * maximum values representable in a given type. These macros may also - * be useful elsewhere, so we provide them outside the - * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. - * - * It would seem more obvious to do something like + * We need to compute the minimum and maximum values representable in a given + * type. These macros may also be useful elsewhere. It would seem more obvious + * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) @@ -54,7 +51,6 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on * a, b and *d having the same type (similar to the min() and max() @@ -90,134 +86,6 @@ static inline bool __must_check __must_check_overflow(bool overflow) __builtin_mul_overflow(__a, __b, __d); \ })) -#else - - -/* Checking for unsigned overflow is relatively easy without causing UB. 
*/ -#define __unsigned_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a + __b; \ - *__d < __a; \ -}) -#define __unsigned_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a - __b; \ - __a < __b; \ -}) -/* - * If one of a or b is a compile-time constant, this avoids a division. - */ -#define __unsigned_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a * __b; \ - __builtin_constant_p(__b) ? \ - __b > 0 && __a > type_max(typeof(__a)) / __b : \ - __a > 0 && __b > type_max(typeof(__b)) / __a; \ -}) - -/* - * For signed types, detecting overflow is much harder, especially if - * we want to avoid UB. But the interface of these macros is such that - * we must provide a result in *d, and in fact we must produce the - * result promised by gcc's builtins, which is simply the possibly - * wrapped-around value. Fortunately, we can just formally do the - * operations in the widest relevant unsigned type (u64) and then - * truncate the result - gcc is smart enough to generate the same code - * with and without the (u64) casts. - */ - -/* - * Adding two signed integers can overflow only if they have the same - * sign, and overflow has happened iff the result has the opposite - * sign. - */ -#define __signed_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a + (u64)__b; \ - (((~(__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Subtraction is similar, except that overflow can now happen only - * when the signs are opposite. In this case, overflow has happened if - * the result has the opposite sign of a. - */ -#define __signed_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a - (u64)__b; \ - ((((__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Signed multiplication is rather hard. gcc always follows C99, so - * division is truncated towards 0. This means that we can write the - * overflow check like this: - * - * (a > 0 && (b > MAX/a || b < MIN/a)) || - * (a < -1 && (b > MIN/a || b < MAX/a) || - * (a == -1 && b == MIN) - * - * The redundant casts of -1 are to silence an annoying -Wtype-limits - * (included in -Wextra) warning: When the type is u8 or u16, the - * __b_c_e in check_mul_overflow obviously selects - * __unsigned_mul_overflow, but unfortunately gcc still parses this - * code and warns about the limited range of __b. 
- */ - -#define __signed_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - typeof(a) __tmax = type_max(typeof(a)); \ - typeof(a) __tmin = type_min(typeof(a)); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a * (u64)__b; \ - (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ - (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ - (__b == (typeof(__b))-1 && __a == __tmin); \ -}) - - -#define check_add_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d))) - -#define check_sub_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d))) - -#define check_mul_overflow(a, b, d) __must_check_overflow( \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d))) - -#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ - /** check_shl_overflow() - Calculate a left-shifted value and check overflow * * @a: Value to be shifted -- cgit v1.2.3 From 4e59869aa6550657cb148ad49835605660ec9b88 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 10 Sep 2021 16:40:46 -0700 Subject: compiler-gcc.h: drop checks for older GCC versions Now that GCC 5.1 is the minimally supported default, drop the values we don't use. Signed-off-by: Nick Desaulniers Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 3f7f6fa0e051..fd82ce169ce9 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -98,10 +98,8 @@ #if GCC_VERSION >= 70000 #define KASAN_ABI_VERSION 5 -#elif GCC_VERSION >= 50000 +#else #define KASAN_ABI_VERSION 4 -#elif GCC_VERSION >= 40902 -#define KASAN_ABI_VERSION 3 #endif #if __has_attribute(__no_sanitize_address__) -- cgit v1.2.3 From 6d2ef226f2f18d530e48ead0cb5704505628b797 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Sep 2021 10:20:01 -0700 Subject: compiler_attributes.h: drop __has_attribute() support for gcc4 Now that GCC 5.1 is the minimally supported default, the manual workaround for older gcc versions not having __has_attribute() are no longer relevant and can be removed. Signed-off-by: Linus Torvalds --- include/linux/compiler_attributes.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 2487be0e7199..ba417a5c80af 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -20,26 +20,6 @@ * Provide links to the documentation of each supported compiler, if it exists. */ -/* - * __has_attribute is supported on gcc >= 5, clang >= 2.9 and icc >= 17. - * In the meantime, to support gcc < 5, we implement __has_attribute - * by hand. 
- */ -#ifndef __has_attribute -# define __has_attribute(x) __GCC4_has_attribute_##x -# define __GCC4_has_attribute___assume_aligned__ 1 -# define __GCC4_has_attribute___copy__ 0 -# define __GCC4_has_attribute___designated_init__ 0 -# define __GCC4_has_attribute___externally_visible__ 1 -# define __GCC4_has_attribute___no_caller_saved_registers__ 0 -# define __GCC4_has_attribute___noclone__ 1 -# define __GCC4_has_attribute___no_profile_instrument_function__ 0 -# define __GCC4_has_attribute___nonstring__ 0 -# define __GCC4_has_attribute___no_sanitize_address__ 1 -# define __GCC4_has_attribute___no_sanitize_undefined__ 1 -# define __GCC4_has_attribute___fallthrough__ 0 -#endif - /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alias-function-attribute */ -- cgit v1.2.3 From df26327ea097eb78e7967c45df6b23010c43c28d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Sep 2021 10:29:44 -0700 Subject: Drop some straggling mentions of gcc-4.9 as being stale Fix up the admin-guide README file to the new gcc-5.1 requirement, and remove a stale comment about gcc support for the __assume_aligned__ attribute. Signed-off-by: Linus Torvalds --- include/linux/compiler_attributes.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index ba417a5c80af..ee19cebabcf5 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -54,7 +54,6 @@ * compiler should see some alignment anyway, when the return value is * massaged by 'flags = ptr & 3; ptr &= ~3;'). * - * Optional: only supported since gcc >= 4.9 * Optional: not supported by icc * * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-assume_005faligned-function-attribute -- cgit v1.2.3 From 80f0a1f99983296be587325004acf72dd11eccd8 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 13 Sep 2021 12:02:56 +0200 Subject: workqueue: annotate alloc_workqueue() as printf This also enables checking of allows alloc_ordered_workqueue(). Signed-off-by: Rolf Eike Beer Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 2ebef6b1a3d6..74d3c1efd9bb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -399,9 +399,8 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...); +__printf(1, 4) struct workqueue_struct * +alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue -- cgit v1.2.3 From c22ac2a3d4bd83411ebf0b1726e9e5fc4f5e7ebf Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 10 Sep 2021 11:33:50 -0700 Subject: perf: Enable branch record for software events The typical way to access branch record (e.g. Intel LBR) is via hardware perf_event. For CPUs with FREEZE_LBRS_ON_PMI support, PMI could capture reliable LBR. On the other hand, LBR could also be useful in non-PMI scenario. For example, in kretprobe or bpf fexit program, LBR could provide a lot of information on what happened with the function. Add API to use branch record for software use. 
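A hypothetical caller of this new API could look roughly like the following (a sketch only; the snapshot implementation itself is wired up by the PMU/arch code):

	struct perf_branch_entry entries[16];
	int nr, i;

	/* static_call() avoids an indirect branch that would itself pollute the LBR */
	nr = static_call(perf_snapshot_branch_stack)(entries, ARRAY_SIZE(entries));
	for (i = 0; i < nr; i++)
		pr_info("branch %d: 0x%llx -> 0x%llx\n", i, entries[i].from, entries[i].to);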
Note that, when the software event triggers, it is necessary to stop the branch record hardware asap. Therefore, static_call is used to remove some branch instructions in this process. Suggested-by: Peter Zijlstra Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Andrii Nakryiko Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210910183352.3151445-2-songliubraving@fb.com --- include/linux/perf_event.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fe156a8170aa..0cbc5dfe1110 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -57,6 +57,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -1612,4 +1613,26 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event, extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); #endif +/* + * Snapshot branch stack on software events. + * + * Branch stack can be very useful in understanding software events. For + * example, when a long function, e.g. sys_perf_event_open, returns an + * errno, it is not obvious why the function failed. Branch stack could + * provide very helpful information in this type of scenarios. + * + * On software event, it is necessary to stop the hardware branch recorder + * fast. Otherwise, the hardware register/buffer will be flushed with + * entries of the triggering event. Therefore, static call is used to + * stop the hardware recorder. + */ + +/* + * cnt is the number of entries allocated for entries. + * Return number of entries copied to . + */ +typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, + unsigned int cnt); +DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); + #endif /* _LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. 
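To illustrate the class of program affected, here is a minimal libbpf-style sketch of such a policy (the program itself is made up; at this hook, returning 0 rejects the call and 1 allows it):

	SEC("cgroup/connect4")
	int deny_connect4(struct bpf_sock_addr *ctx)
	{
		return 0;	/* deny all IPv4 connect(2) calls in this cgroup */
	}

Before this fix, tagging a socket with a cgroup v1 net_cls classid made sock_cgroup_ptr() fall back to the v2 root, so a program like this attached to a non-root v2 cgroup was silently skipped.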
In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- include/linux/cgroup-defs.h | 107 +++++++++++--------------------------------- include/linux/cgroup.h | 22 +-------- 2 files changed, 28 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e1c705fdfa7c..db2e147e069f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -752,107 +752,54 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * - * On legacy hierarchies, net_prio and net_cls controllers directly set - * attributes on each sock which can then be tested by the network layer. - * On the default hierarchy, each sock is associated with the cgroup it was - * created in and the networking layer can match the cgroup directly. - * - * To avoid carrying all three cgroup related fields separately in sock, - * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. - * On boot, sock_cgroup_data records the cgroup that the sock was created - * in so that cgroup2 matches can be made; however, once either net_prio or - * net_cls starts being used, the area is overridden to carry prioidx and/or - * classid. The two modes are distinguished by whether the lowest bit is - * set. Clear bit indicates cgroup pointer while set bit prioidx and - * classid. - * - * While userland may start using net_prio or net_cls at any time, once - * either is used, cgroup2 matching no longer works. There is no reason to - * mix the two and this is in line with how legacy and v2 compatibility is - * handled. On mode switch, cgroup references which are already being - * pointed to by socks may be leaked. 
While this can be remedied by adding - * synchronization around sock_cgroup_data, given that the number of leaked - * cgroups is bound and highly unlikely to be high, this seems to be the - * better trade-off. + * On legacy hierarchies, net_prio and net_cls controllers directly + * set attributes on each sock which can then be tested by the network + * layer. On the default hierarchy, each sock is associated with the + * cgroup it was created in and the networking layer can match the + * cgroup directly. */ struct sock_cgroup_data { - union { -#ifdef __LITTLE_ENDIAN - struct { - u8 is_data : 1; - u8 no_refcnt : 1; - u8 unused : 6; - u8 padding; - u16 prioidx; - u32 classid; - } __packed; -#else - struct { - u32 classid; - u16 prioidx; - u8 padding; - u8 unused : 6; - u8 no_refcnt : 1; - u8 is_data : 1; - } __packed; + struct cgroup *cgroup; /* v2 */ +#ifdef CONFIG_CGROUP_NET_CLASSID + u32 classid; /* v1 */ +#endif +#ifdef CONFIG_CGROUP_NET_PRIO + u16 prioidx; /* v1 */ #endif - u64 val; - }; }; -/* - * There's a theoretical window where the following accessors race with - * updaters and return part of the previous pointer as the prioidx or - * classid. Such races are short-lived and the result isn't critical. - */ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { - /* fallback to 1 which is always the ID of the root cgroup */ - return (skcd->is_data & 1) ? skcd->prioidx : 1; +#ifdef CONFIG_CGROUP_NET_PRIO + return READ_ONCE(skcd->prioidx); +#else + return 1; +#endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { - /* fallback to 0 which is the unconfigured default classid */ - return (skcd->is_data & 1) ? skcd->classid : 0; +#ifdef CONFIG_CGROUP_NET_CLASSID + return READ_ONCE(skcd->classid); +#else + return 0; +#endif } -/* - * If invoked concurrently, the updaters may clobber each other. The - * caller is responsible for synchronization. 
- */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_prioidx(&skcd_buf) == prioidx) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.prioidx = prioidx; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_PRIO + WRITE_ONCE(skcd->prioidx, prioidx); +#endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; - - if (sock_cgroup_classid(&skcd_buf) == classid) - return; - - if (!(skcd_buf.is_data & 1)) { - skcd_buf.val = 0; - skcd_buf.is_data = 1; - } - - skcd_buf.classid = classid; - WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ +#ifdef CONFIG_CGROUP_NET_CLASSID + WRITE_ONCE(skcd->classid, classid); +#endif } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7bf60454a313..75c151413fda 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -829,33 +829,13 @@ static inline void cgroup_account_cputime_field(struct task_struct *task, */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -extern spinlock_t cgroup_sk_update_lock; -#endif - -void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - unsigned long v; - - /* - * @skcd->val is 64bit but the following is safe on 32bit too as we - * just need the lower ulong to be written and read atomically. - */ - v = READ_ONCE(skcd->val); - - if (v & 3) - return &cgrp_dfl_root.cgrp; - - return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; -#else - return (struct cgroup *)(unsigned long)skcd->val; -#endif + return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ -- cgit v1.2.3 From efeff6b39b9de4480572c7b0c5eb77204795cb57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 13:28:24 -0700 Subject: rcutorture: Warn on individual rcu_torture_init() error conditions When running rcutorture as a module, any rcu_torture_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running rcutorture built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing rcu_torture_init() errors when running rcutorture built-in. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index 0910c5803f35..24f58e50a94b 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -47,6 +47,14 @@ do { \ } while (0) void verbose_torout_sleep(void); +#define torture_init_error(firsterr) \ +({ \ + int ___firsterr = (firsterr); \ + \ + WARN_ONCE(!IS_MODULE(CONFIG_RCU_TORTURE_TEST) && ___firsterr < 0, "Torture-test initialization failed with error code %d\n", ___firsterr); \ + ___firsterr < 0; \ +}) + /* Definitions for online/offline exerciser. 
*/ #ifdef CONFIG_HOTPLUG_CPU int torture_num_online_cpus(void); -- cgit v1.2.3 From f4c87dbbef2638f6da6e29b5e998e3b1dcdb08ee Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 9 Aug 2021 13:25:13 +0200 Subject: kcsan: Save instruction pointer for scoped accesses Save the instruction pointer for scoped accesses, so that it becomes possible for the reporting code to construct more accurate stack traces that will show the start of the scope. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 9fd0ad80fef6..5f5965246877 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -100,9 +100,12 @@ void kcsan_set_access_mask(unsigned long mask); /* Scoped access information. */ struct kcsan_scoped_access { struct list_head list; + /* Access information. */ const volatile void *ptr; size_t size; int type; + /* Location where scoped access was set up. */ + unsigned long ip; }; /* * Automatically call kcsan_end_scoped_access() when kcsan_scoped_access goes -- cgit v1.2.3 From 7a8aa39d44564703620d937bb54cdea2d003657f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 13 Sep 2021 17:05:51 +0100 Subject: nvmem: core: Add stubs for nvmem_cell_read_variable_le_u32/64 if !CONFIG_NVMEM When I added nvmem_cell_read_variable_le_u32() and nvmem_cell_read_variable_le_u64() I forgot to add the "static inline" stub functions for when CONFIG_NVMEM wasn't defined. Add them now. This was causing problems with randconfig builds that compiled `drivers/soc/qcom/cpr.c`. Fixes: 6feba6a62c57 ("PM: AVS: qcom-cpr: Use nvmem_cell_read_variable_le_u32()") Fixes: a28e824fb827 ("nvmem: core: Add functions to make number reading easy") Reported-by: kernel test robot Reviewed-by: Bjorn Andersson Signed-off-by: Douglas Anderson Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20210913160551.12907-1-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 923dada24eb4..c0c0cefc3b92 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -150,6 +150,20 @@ static inline int nvmem_cell_read_u64(struct device *dev, return -EOPNOTSUPP; } +static inline int nvmem_cell_read_variable_le_u32(struct device *dev, + const char *cell_id, + u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int nvmem_cell_read_variable_le_u64(struct device *dev, + const char *cell_id, + u64 *val) +{ + return -EOPNOTSUPP; +} + static inline struct nvmem_device *nvmem_device_get(struct device *dev, const char *name) { -- cgit v1.2.3 From 81065b35e2486c024c7aa86caed452e1f01a59d4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 13 Sep 2021 14:52:39 -0700 Subject: x86/mce: Avoid infinite loop for copy from user recovery There are two cases for machine check recovery: 1) The machine check was triggered by ring3 (application) code. This is the simpler case. The machine check handler simply queues work to be executed on return to user. That code unmaps the page from all users and arranges to send a SIGBUS to the task that triggered the poison. 2) The machine check was triggered in kernel code that is covered by an exception table entry. 
In this case the machine check handler still queues a work entry to unmap the page, etc., but this will not be called right away because the #MC handler returns to the fixup code address in the exception table entry. Problems occur if the kernel triggers another machine check before the return to user processes the first queued work item. Specifically, the work is queued using the ->mce_kill_me callback structure in the task struct for the current thread. Attempting to queue a second work item using this same callback results in a loop in the linked list of work functions to call. So when the kernel does return to user, it enters an infinite loop processing the same entry forever. There are some legitimate scenarios where the kernel may take a second machine check before returning to the user. 1) Some code (e.g. futex) first tries a get_user() with page faults disabled. If this fails, the code retries with page faults enabled, expecting that this will resolve the page fault. 2) Copy from user code retries a copy in byte-at-time mode to check whether any additional bytes can be copied. On the other side of the fence are some bad drivers that do not check the return value from individual get_user() calls and may access multiple user addresses without noticing that some/all calls have failed. Fix by adding a counter (current->mce_count) to keep track of repeated machine checks before task_work() is called. The first machine check saves the address information and calls task_work_add(). Subsequent machine checks before that task_work callback is executed check that the address is in the same page as the first machine check (since the callback will offline exactly one page). Expected worst case is four machine checks before moving on (e.g. one user access with page faults disabled, then a repeat to the same address with page faults enabled ... repeat in copy tail bytes). Just in case there is some code that loops forever, enforce a limit of 10. [ bp: Massage commit message, drop noinstr, fix typo, extend panic messages. ] Fixes: 5567d11c21a1 ("x86/mce: Send #MC singal from task work") Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/YT/IJ9ziLqmtqEPu@agluck-desk2.amr.corp.intel.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1780260f237b..361c7bc72cbb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1468,6 +1468,7 @@ struct task_struct { mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; + int mce_count; #endif #ifdef CONFIG_KRETPROBES -- cgit v1.2.3 From 58877b0824da15698bd85a0a9dbfa8c354e6ecb7 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 9 Sep 2021 12:11:58 +0530 Subject: usb: core: hcd: Add support for deferring roothub registration It has been observed with certain PCIe USB cards (like Inateck connected to AM64 EVM or J7200 EVM) that as soon as the primary roothub is registered, port status changes are handled even before the xHC is running, leading to cold-plugged USB devices not being detected. For such cases, registering both the root hubs along with the second HCD is required. Add support for deferring roothub registration in usb_add_hcd(), so that both primary and secondary roothubs are registered along with the second HCD. 
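Returning to the x86/mce fix above, here is a loose, hypothetical sketch of the mce_count bookkeeping it describes (the real logic lives in arch/x86/kernel/cpu/mce/core.c; the page comparison and the limit are paraphrased from the commit message, not verbatim kernel code):

	int count = ++current->mce_count;

	if (count == 1) {
		/* first machine check: save the address, queue the task_work */
		current->mce_addr = addr;
		task_work_add(current, &current->mce_kill_me, TWA_RESUME);
	} else if (count > 10 ||
		   (addr >> PAGE_SHIFT) != (current->mce_addr >> PAGE_SHIFT)) {
		/* a machine-check storm, or a second poisoned page: give up */
		panic("Repeated machine checks while accessing user data");
	}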
CC: stable@vger.kernel.org # 5.4+ Suggested-by: Mathias Nyman Tested-by: Chris Chiu Acked-by: Alan Stern Signed-off-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20210909064200.16216-2-kishon@ti.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 548a028f2dab..2c1fc9212cf2 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,6 +124,7 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ +#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -134,6 +135,7 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) +#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default -- cgit v1.2.3 From 8988bacd6045adf39719e5057e43170f83bd1709 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 31 Aug 2021 17:30:44 +0800 Subject: kobject: unexport kobject_create() in kobject.h The function kobject_create() is only used by one caller, kobject_create_and_add(); no other driver uses it, nor is it exported to other modules. However, it's still exported in kobject.h, and can sometimes confuse users of kobject.h. All users should call kobject_create_and_add() or, if extra attributes are needed, allocate the memory manually and then call kobject_init_and_add(). Signed-off-by: Qu Wenruo Link: https://lore.kernel.org/r/20210831093044.110729-1-wqu@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ea30529fba08..efd56f990a46 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -101,7 +101,6 @@ int kobject_init_and_add(struct kobject *kobj, extern void kobject_del(struct kobject *kobj); -extern struct kobject * __must_check kobject_create(void); extern struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent); -- cgit v1.2.3 From 82bcb7fb649844a561ff1ac2e2ace4252bdff793 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 23 Aug 2021 14:22:01 +0300 Subject: iio: st_sensors: remove st_sensors_deallocate_trigger() function This change converts st_sensors_allocate_trigger() to use device-managed functions. The parent device of the IIO device object is used. This is based on the assumption that all other devm_ calls in the ST sensors use this reference. That makes the st_sensors_deallocate_trigger() function unneeded, so it can be removed. 
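Concretely, the conversion relies on the standard devm action pattern; a sketch under the assumption that the cleanup only needs to unregister the trigger (the callback name is made up):

	static void st_sensors_trigger_cleanup(void *data)
	{
		struct iio_trigger *trig = data;

		iio_trigger_unregister(trig);
	}

	/* runs automatically when the parent device unbinds */
	err = devm_add_action_or_reset(indio_dev->dev.parent,
				       st_sensors_trigger_cleanup, sdata->trig);
	if (err)
		return err;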
Reviewed-by: Andy Shevchenko Signed-off-by: Alexandru Ardelean Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210823112204.243255-3-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 8bdbaf3f3796..e74b55244f35 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -273,7 +273,6 @@ irqreturn_t st_sensors_trigger_handler(int irq, void *p); int st_sensors_allocate_trigger(struct iio_dev *indio_dev, const struct iio_trigger_ops *trigger_ops); -void st_sensors_deallocate_trigger(struct iio_dev *indio_dev); int st_sensors_validate_device(struct iio_trigger *trig, struct iio_dev *indio_dev); #else @@ -282,10 +281,6 @@ static inline int st_sensors_allocate_trigger(struct iio_dev *indio_dev, { return 0; } -static inline void st_sensors_deallocate_trigger(struct iio_dev *indio_dev) -{ - return; -} #define st_sensors_validate_device NULL #endif -- cgit v1.2.3 From 5363c6c17b1014f696bf0b2984af4b694062ce8f Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 23 Aug 2021 14:22:02 +0300 Subject: iio: st_sensors: remove st_sensors_power_disable() function This change converts the st_sensors_power_enable() function to use devm_add_action_or_reset() handlers to register regulator_disable hooks for when the drivers get unloaded. The parent device of the IIO device object is used. This is based on the assumption that all other devm_ calls in the ST sensors use this reference. This makes st_sensors_power_disable() unneeded. Removing this also changes unload order a bit, as all ST drivers would call st_sensors_power_disable() first and iio_device_unregister() after that. Reviewed-by: Andy Shevchenko Signed-off-by: Alexandru Ardelean Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210823112204.243255-4-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index e74b55244f35..fc90c202d15e 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -293,8 +293,6 @@ int st_sensors_set_axis_enable(struct iio_dev *indio_dev, u8 axis_enable); int st_sensors_power_enable(struct iio_dev *indio_dev); -void st_sensors_power_disable(struct iio_dev *indio_dev); - int st_sensors_debugfs_reg_access(struct iio_dev *indio_dev, unsigned reg, unsigned writeval, unsigned *readval); -- cgit v1.2.3 From 6b658c31bb6bc033f6ae60141c676383fb78784c Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 23 Aug 2021 14:22:03 +0300 Subject: iio: st_sensors: remove all driver remove functions At this point all ST driver remove functions do iio_device_unregister(). This change removes them and replaces all iio_device_register() calls with devm_iio_device_register(). This can be done in a single change relatively easily, since all these remove functions are declared in st_sensors.h. 
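The probe-side result is the usual managed idiom, roughly (sketch):

	/* the IIO device is now unregistered automatically on unbind,
	 * so the driver no longer needs a remove() function for it */
	return devm_iio_device_register(indio_dev->dev.parent, indio_dev);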
Reviewed-by: Andy Shevchenko Signed-off-by: Alexandru Ardelean Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210823112204.243255-5-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index fc90c202d15e..d17ae1e5ca19 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -323,21 +323,17 @@ void st_sensors_dev_name_probe(struct device *dev, char *name, int len); /* Accelerometer */ const struct st_sensor_settings *st_accel_get_settings(const char *name); int st_accel_common_probe(struct iio_dev *indio_dev); -void st_accel_common_remove(struct iio_dev *indio_dev); /* Gyroscope */ const struct st_sensor_settings *st_gyro_get_settings(const char *name); int st_gyro_common_probe(struct iio_dev *indio_dev); -void st_gyro_common_remove(struct iio_dev *indio_dev); /* Magnetometer */ const struct st_sensor_settings *st_magn_get_settings(const char *name); int st_magn_common_probe(struct iio_dev *indio_dev); -void st_magn_common_remove(struct iio_dev *indio_dev); /* Pressure */ const struct st_sensor_settings *st_press_get_settings(const char *name); int st_press_common_probe(struct iio_dev *indio_dev); -void st_press_common_remove(struct iio_dev *indio_dev); #endif /* ST_SENSORS_H */ -- cgit v1.2.3 From e42696515414a15774c80f1d454194ce0cd9f145 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 23 Aug 2021 14:22:04 +0300 Subject: iio: st_sensors: remove reference to parent device object on st_sensor_data The idea behind it is that all devm_ calls in ST sensors are bound to the parent device object. However, the reference to that object is kept on both the st_sensor_data struct and the IIO object parent (indio_dev->dev.parent). This change only adds a bit of consistency and uses the reference stored on indio_dev->dev.parent to enforce the assumption that all ST sensors' devm_ calls are bound to the same reference as the one stored on st_sensor_data. Reviewed-by: Andy Shevchenko Signed-off-by: Alexandru Ardelean Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210823112204.243255-6-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index d17ae1e5ca19..22f67845cdd3 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -220,7 +220,6 @@ struct st_sensor_settings { /** * struct st_sensor_data - ST sensor device status - * @dev: Pointer to instance of struct device (I2C or SPI). * @trig: The trigger in use by the core driver. * @mount_matrix: The mounting matrix of the sensor. * @sensor_settings: Pointer to the specific sensor settings in use. @@ -240,7 +239,6 @@ struct st_sensor_settings { * @buffer_data: Data used by buffer part. 
*/ struct st_sensor_data { - struct device *dev; struct iio_trigger *trig; struct iio_mount_matrix mount_matrix; struct st_sensor_settings *sensor_settings; -- cgit v1.2.3 From 8fb0f47a9d7acf620d0fd97831b69da9bc5e22ed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Sep 2021 11:18:36 -0600 Subject: iov_iter: add helper to save iov_iter state In an ideal world, when someone is passed an iov_iter and returns X bytes, then X bytes would have been consumed/advanced from the iov_iter. But we have use cases that always consume the entire iterator, a few examples of that are iomap and bdev O_DIRECT. This means we cannot rely on the state of the iov_iter once we've called ->read_iter() or ->write_iter(). This would be easier if we didn't always have to deal with truncation of the iov_iter, as rewinding would be trivial without that. We recently added a commit to track the truncate state, but that grew the iov_iter by 8 bytes and wasn't the best solution. Implement a helper to save enough of the iov_iter state to sanely restore it after we've called the read/write iterator helpers. This currently only works for IOVEC/BVEC/KVEC as that's all we need; support for other iterator types is left as an exercise for the reader. Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wiacKV4Gh-MYjteU0LwNBSGpWrK-Ov25HdqB1ewinrFPg@mail.gmail.com/ Signed-off-by: Jens Axboe --- include/linux/uio.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 5265024e8b90..984c4ab74859 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -27,6 +27,12 @@ enum iter_type { ITER_DISCARD, }; +struct iov_iter_state { + size_t iov_offset; + size_t count; + unsigned long nr_segs; +}; + struct iov_iter { u8 iter_type; bool data_source; @@ -55,6 +61,14 @@ static inline enum iter_type iov_iter_type(const struct iov_iter *i) return i->iter_type; } +static inline void iov_iter_save_state(struct iov_iter *iter, + struct iov_iter_state *state) +{ + state->iov_offset = iter->iov_offset; + state->count = iter->count; + state->nr_segs = iter->nr_segs; +} + static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; @@ -233,6 +247,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); +void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); -- cgit v1.2.3 From 2935662449dfa4467d2e769a70801c608fd510c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Sep 2021 07:41:10 +0200 Subject: kernfs: remove kernfs_create_file and kernfs_create_file_ns All callers actually use __kernfs_create_file. 
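Picking up the iov_iter helpers introduced above, a hypothetical caller would use them roughly like this (names and the retry condition are illustrative):

	struct iov_iter_state state;
	ssize_t ret;

	iov_iter_save_state(&iter, &state);
	ret = call_write_iter(file, kiocb, &iter);
	if (ret == -EAGAIN) {
		/* ->write_iter() may have consumed the iterator even on
		 * failure; rewind it before retrying */
		iov_iter_restore(&iter, &state);
	}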
Acked-by: Christian Brauner Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210913054121.616001-3-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 1093abf7c28c..cecfeedb7361 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -568,30 +568,6 @@ kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, priv, NULL); } -static inline struct kernfs_node * -kernfs_create_file_ns(struct kernfs_node *parent, const char *name, - umode_t mode, kuid_t uid, kgid_t gid, - loff_t size, const struct kernfs_ops *ops, - void *priv, const void *ns) -{ - struct lock_class_key *key = NULL; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - key = (struct lock_class_key *)&ops->lockdep_key; -#endif - return __kernfs_create_file(parent, name, mode, uid, gid, - size, ops, priv, ns, key); -} - -static inline struct kernfs_node * -kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, - loff_t size, const struct kernfs_ops *ops, void *priv) -{ - return kernfs_create_file_ns(parent, name, mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - size, ops, priv, NULL); -} - static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { -- cgit v1.2.3 From eaf501e0d8af691e532b6b9aee511659cf5ee00c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Sep 2021 07:41:11 +0200 Subject: kernfs: remove the unused lockdep_key field in struct kernfs_ops Not actually used anywhere. Acked-by: Christian Brauner Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210913054121.616001-4-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- include/linux/kernfs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index cecfeedb7361..3ccce6f24548 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -269,10 +269,6 @@ struct kernfs_ops { struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lock_class_key lockdep_key; -#endif }; /* -- cgit v1.2.3 From a2aec2c86ef0cad0fd6be718cfeb5cf5eefbfca9 Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Mon, 30 Aug 2021 16:33:56 +0800 Subject: mtd: Remove obsolete macros only used by the old nand_ecclayout struct All uses of MTD_MAX_{OOBFREE,ECCPOS}_ENTRIES_LARGE have been removed as commit ef5eeea6e911 ("mtd: nand: brcm: switch to mtd_ooblayout_ops") and commit aab616e31d1c ("mtd: kill the nand_ecclayout struct") replaced struct nand_ecclayout by the new mtd_ooblayout_ops interface. Therefore, remove these two macros.
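For background, the mtd_ooblayout_ops interface that replaced nand_ecclayout describes OOB regions via callbacks instead of fixed-size position arrays, which is why the _LARGE sizing macros became dead. A minimal hedged sketch; the mychip_* names and offsets are illustrative, not from any real driver:

	static int mychip_ooblayout_ecc(struct mtd_info *mtd, int section,
					struct mtd_oob_region *region)
	{
		if (section)
			return -ERANGE;
		region->offset = 2;	/* illustrative ECC byte placement */
		region->length = 6;
		return 0;
	}

	static const struct mtd_ooblayout_ops mychip_ooblayout_ops = {
		.ecc	= mychip_ooblayout_ecc,
		/* .free callback omitted for brevity */
	};

	/* in the driver probe path */
	mtd_set_ooblayout(mtd, &mychip_ooblayout_ops);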
Reported-by: Yi Yang Signed-off-by: GONG, Ruiqi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210830083356.31702-1-gongruiqi1@huawei.com --- include/linux/mtd/mtd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 88227044fc86..f5e7dfc2e4e9 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -72,8 +72,6 @@ struct mtd_oob_ops { uint8_t *oobbuf; }; -#define MTD_MAX_OOBFREE_ENTRIES_LARGE 32 -#define MTD_MAX_ECCPOS_ENTRIES_LARGE 640 /** * struct mtd_oob_region - oob region definition * @offset: region offset -- cgit v1.2.3 From 356ed64991c6847a0c4f2e8fa3b1133f7a14f1fc Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 14 Sep 2021 10:33:51 +0800 Subject: bpf: Handle return value of BPF_PROG_TYPE_STRUCT_OPS prog Currently if a function ptr in struct_ops has a return value, its caller will get a random return value from it, because the return value of the related BPF_PROG_TYPE_STRUCT_OPS prog is just dropped. So add a new flag BPF_TRAMP_F_RET_FENTRY_RET to tell the bpf trampoline to save and return the return value of the struct_ops prog if the ret_size of the function ptr is greater than 0. Also restrict the flag to be used alone. Fixes: 85d33df357b6 ("bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210914023351.3664499-1-houtao1@huawei.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4c16f19f83e..020a7d5bf470 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -578,11 +578,12 @@ struct btf_func_model { * programs only. Should not be used with normal calls and indirect calls. */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) - /* Store IP address of the caller on the trampoline stack, * so it's available for trampoline's programs. */ #define BPF_TRAMP_F_IP_ARG BIT(3) +/* Return the return value of fentry prog. Only used by bpf_struct_ops. */ +#define BPF_TRAMP_F_RET_FENTRY_RET BIT(4) /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 -- cgit v1.2.3 From 77e02cf57b6cff9919949defb7fd9b8ac16399a2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Sep 2021 13:23:22 -0700 Subject: memblock: introduce saner 'memblock_free_ptr()' interface The boot-time allocation interface for memblock is a mess, with 'memblock_alloc()' returning a virtual pointer, but then you are supposed to free it with 'memblock_free()' that takes a _physical_ address. Not only is that all kinds of strange and illogical, but it actually causes bugs, when people then use it like a normal allocation function, and it fails spectacularly on a NULL pointer: https://lore.kernel.org/all/20210912140820.GD25450@xsang-OptiPlex-9020/ or just random memory corruption if the debug checks don't catch it: https://lore.kernel.org/all/61ab2d0c-3313-aaab-514c-e15b7aa054a0@suse.cz/ I really don't want to apply patches that treat the symptoms, when the fundamental cause is this horribly confusing interface. I started out looking at just automating a sane replacement sequence, but because of this mix of virtual and physical addresses, and because people have used the "__pa()" macro that can take either a regular kernel pointer, or just the raw "unsigned long" address, it's all quite messy.
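To make the virtual/physical mismatch concrete, a hedged sketch of the old pattern next to the helper introduced below:

	void *p = memblock_alloc(size, SMP_CACHE_BYTES);	/* returns a virtual pointer */
	...
	memblock_free(__pa(p), size);	/* old: free wants a physical address */
	memblock_free_ptr(p, size);	/* new: free takes the pointer itself */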
So this just introduces a new saner interface for freeing a virtual address that was allocated using 'memblock_alloc()', and that was kept as a regular kernel pointer. And then it converts a couple of users that are obvious and easy to test, including the 'xbc_nodes' case in lib/bootconfig.c that caused problems. Reported-by: kernel test robot Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Steven Rostedt Cc: Mike Rapoport Cc: Andrew Morton Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Vlastimil Babka Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b066024c62e3..34de69b3b8ba 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -118,6 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); +void memblock_free_ptr(void *ptr, size_t size); void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); -- cgit v1.2.3 From 67f1e027c27054e641584655020a417eaac9cb3a Mon Sep 17 00:00:00 2001 From: Lukas Prediger Date: Tue, 14 Sep 2021 00:09:42 +0100 Subject: drivers/cdrom: improved ioctl for media change detection The current implementation of the CDROM_MEDIA_CHANGED ioctl relies on global state, meaning that only one process can detect a disc change while the ioctl call will return 0 for other calling processes afterwards (see bug 213267). This introduces a new cdrom ioctl, CDROM_TIMED_MEDIA_CHANGE, that works by maintaining a timestamp of the last detected disc change instead of a boolean flag: Processes calling this ioctl command can provide a timestamp of the last disc change known to them and receive an indication of whether the disc was changed since then, along with the updated timestamp. I considered fixing the buggy behavior in the original CDROM_MEDIA_CHANGED ioctl, but that would require maintaining state for each calling process in the kernel, which seems like a worse solution than introducing this new ioctl. Signed-off-by: Lukas Prediger Link: https://lore.kernel.org/all/20210912191207.74449-1-lumip@lumip.de Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20210913230942.1188-1-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index c4fef00abdf3..0a89f111e00e 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -64,6 +64,7 @@ struct cdrom_device_info { int for_data; int (*exit)(struct cdrom_device_info *); int mrw_mode_page; + __s64 last_media_change_ms; }; struct cdrom_device_ops { -- cgit v1.2.3 From e25b694bf1d9ef4a3f36c0b85348f8e780f22139 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:05 +0200 Subject: x86: Always inline context_tracking_guest_enter() Yes, it really did out-of-line this....
vmlinux.o: warning: objtool: vmx_vcpu_enter_exit()+0x31: call to context_tracking_guest_enter() leaves .noinstr.text section 000000000019f660 : 19f660: e8 00 00 00 00 callq 19f665 19f661: R_X86_64_PLT32 __sanitizer_cov_trace_pc-0x4 19f665: 31 c0 xor %eax,%eax 19f667: c3 retq Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.003928226@infradead.org --- include/linux/context_tracking.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 4d7fced3a39f..7a14807c9d1a 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -105,7 +105,7 @@ static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } -static inline bool context_tracking_guest_enter(void) { return false; } +static __always_inline bool context_tracking_guest_enter(void) { return false; } static inline void context_tracking_guest_exit(void) { } #endif /* !CONFIG_CONTEXT_TRACKING */ -- cgit v1.2.3 From 7dedd3e18077f996a10c47250ac85d080e5f474e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 10 Sep 2021 11:19:58 -0600 Subject: Revert "iov_iter: track truncated size" This reverts commit 2112ff5ce0c1128fe7b4d19cfe7f2b8ce5b595fa. We no longer need to track the truncation count; the one user that did need it has been converted to using iov_iter_restore() instead. Signed-off-by: Jens Axboe --- include/linux/uio.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 984c4ab74859..207101a9c5c3 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -53,7 +53,6 @@ struct iov_iter { }; loff_t xarray_start; }; - size_t truncated; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) @@ -270,10 +269,8 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ - if (i->count > count) { - i->truncated += i->count - count; + if (i->count > count) i->count = count; - } } /* @@ -282,7 +279,6 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { - i->truncated -= count - i->count; i->count = count; } -- cgit v1.2.3 From a5c071ccfa1728508f31e61213ee795e4529d0d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jul 2021 12:28:27 -0700 Subject: rcu-tasks: Remove second argument of rcu_read_unlock_trace_special() The second argument of rcu_read_unlock_trace_special() is always zero. When called from exit_tasks_rcu_finish_trace(), it is the constant zero, and rcu_read_unlock_trace_special() doesn't get called from rcu_read_unlock_trace() unless the value of local variable "nesting" is zero because in that case the early return is taken instead. This commit therefore removes the "nesting" argument from the rcu_read_unlock_trace_special() function, substituting the constant zero within that function. This commit also adds a WARN_ON_ONCE() to rcu_read_unlock_trace() in case non-zeroness some day appears. Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate_trace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 86c8f6c98412..6f9c35817398 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void) #ifdef CONFIG_TASKS_TRACE_RCU -void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); +void rcu_read_unlock_trace_special(struct task_struct *t); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section @@ -80,7 +80,8 @@ static inline void rcu_read_unlock_trace(void) WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. } - rcu_read_unlock_trace_special(t, nesting); + WARN_ON_ONCE(nesting != 0); + rcu_read_unlock_trace_special(t); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); -- cgit v1.2.3 From f6b5f1a56987de837f8e25cd560847106b8632a8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 14 Sep 2021 20:52:24 -0700 Subject: compiler.h: Introduce absolute_pointer macro absolute_pointer() disassociates a pointer from its originating symbol type and context. Use it to prevent compiler warnings/errors such as drivers/net/ethernet/i825xx/82596.c: In function 'i82596_probe': arch/m68k/include/asm/string.h:72:25: error: '__builtin_memcpy' reading 6 bytes from a region of size 0 [-Werror=stringop-overread] Such warnings may be reported by gcc 11.x for string and memory operations on fixed addresses. Suggested-by: Linus Torvalds Signed-off-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b67261a1e3e9..3d5af56337bd 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -188,6 +188,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, (typeof(ptr)) (__ptr + (off)); }) #endif +#define absolute_pointer(val) RELOC_HIDE((void *)(val), 0) + #ifndef OPTIMIZER_HIDE_VAR /* Make the optimizer believe the variable can be manipulated arbitrarily. */ #define OPTIMIZER_HIDE_VAR(var) \ -- cgit v1.2.3 From 925da92ba5cb0c82d07cdd5049a07e40f54e9c44 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 26 Aug 2021 22:21:22 -0400 Subject: rcu: Avoid unneeded function call in rcu_read_unlock() Since commit aa40c138cc8f3 ("rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs") the function rcu_read_unlock_strict() is invoked by the inlined rcu_read_unlock() function. However, rcu_read_unlock_strict() is an empty function in production kernels, which are built with CONFIG_RCU_STRICT_GRACE_PERIOD=n. There is a mention of rcu_read_unlock_strict() in the BPF verifier, but this is in a deny-list, meaning that BPF does not care whether rcu_read_unlock_strict() is ever called. This commit therefore provides a slight performance improvement by hoisting the check of CONFIG_RCU_STRICT_GRACE_PERIOD from rcu_read_unlock_strict() into rcu_read_unlock(), thus avoiding the pointless call to an empty function. Cc: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Waiman Long Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 434d12fe2d4f..5e0beb5c5659 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -71,7 +71,8 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { preempt_enable(); - rcu_read_unlock_strict(); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) -- cgit v1.2.3 From 8dc84dcd7f74b50f81de3dbf6f6b5b146e3a8eea Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 16 Sep 2021 14:27:41 -0700 Subject: net: phy: broadcom: Enable 10BaseT DAC early wake Enabling the DAC early wake when the link operates at 10BaseT allows power savings in the hundreds of milliwatts by shutting down the transmitter. A number of errata have been issued for various Gigabit PHYs and the recommendation is to enable both the early and forced DAC wake to be on the safe side. This needs to be done dynamically based upon the link state, which is why a link_change_notify callback is utilized. Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20210916212742.1653088-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index c2c2147dfeb8..3308cebe1c19 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -233,6 +233,7 @@ #define MII_BCM54XX_EXP_EXP08 0x0F08 #define MII_BCM54XX_EXP_EXP08_RJCT_2MHZ 0x0001 #define MII_BCM54XX_EXP_EXP08_EARLY_DAC_WAKE 0x0200 +#define MII_BCM54XX_EXP_EXP08_FORCE_DAC_WAKE 0x0100 #define MII_BCM54XX_EXP_EXP75 0x0f75 #define MII_BCM54XX_EXP_EXP75_VDACCTRL 0x003c #define MII_BCM54XX_EXP_EXP75_CM_OSC 0x0001 -- cgit v1.2.3 From 3c9cfb5269f76d447dbadb67835368f3111a91d7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 14:17:35 +0300 Subject: net: update NXP copyright text NXP Legal insists that the following are not fine: - Saying "NXP Semiconductors" instead of "NXP", since the company's registered name is "NXP" - Putting a "(c)" sign in the copyright string - Putting a comma in the copyright string The only accepted copyright string format is "Copyright NXP". This patch changes the copyright headers in the networking files that were sent by me, or derived from code sent by me. Signed-off-by: Vladimir Oltean Signed-off-by: David S.
Miller --- include/linux/dsa/ocelot.h | 2 +- include/linux/packing.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index c6bc45ae5e03..435777a0073c 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 - * Copyright 2019-2021 NXP Semiconductors + * Copyright 2019-2021 NXP */ #ifndef _NET_DSA_TAG_OCELOT_H diff --git a/include/linux/packing.h b/include/linux/packing.h index 54667735cc67..8d6571feb95d 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: BSD-3-Clause - * Copyright (c) 2016-2018, NXP Semiconductors + * Copyright 2016-2018 NXP * Copyright (c) 2018-2019, Vladimir Oltean */ #ifndef _LINUX_PACKING_H -- cgit v1.2.3 From 12235da8c80a1f9909008e4ca6036d5772b81192 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 9 Sep 2021 11:32:18 +0200 Subject: kernel/locking: Add context to ww_mutex_trylock() i915 will soon gain an eviction path that trylocks a whole lot of locks for eviction, getting dmesg failures like below: BUG: MAX_LOCK_DEPTH too low! turning off the locking correctness validator. depth: 48 max: 48! 48 locks held by i915_selftest/5776: #0: ffff888101a79240 (&dev->mutex){....}-{3:3}, at: __driver_attach+0x88/0x160 #1: ffffc900009778c0 (reservation_ww_class_acquire){+.+.}-{0:0}, at: i915_vma_pin.constprop.63+0x39/0x1b0 [i915] #2: ffff88800cf74de8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_vma_pin.constprop.63+0x5f/0x1b0 [i915] #3: ffff88810c7f9e38 (&vm->mutex/1){+.+.}-{3:3}, at: i915_vma_pin_ww+0x1c4/0x9d0 [i915] #4: ffff88810bad5768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] #5: ffff88810bad60e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] ... #46: ffff88811964d768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] #47: ffff88811964e0e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] INFO: lockdep is turned off. Fixing eviction to nest into ww_class_acquire is a high priority, but it requires a rework of the entire driver, which can only be done one step at a time. As an intermediate solution, add an acquire context to ww_mutex_trylock, which allows us to do proper nesting annotations on the trylocks, making the above lockdep splat disappear. This is also useful in regulator_lock_nested, which may avoid dropping regulator_nesting_mutex in the uncontended path, so use it there. TTM may be another user for this, where we could lock a buffer in a fastpath with list locks held, without dropping all locks we hold.
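To illustrate the new calling convention (the dma-resv conversion is in the diff below; this eviction loop is only a sketch with illustrative names):

	struct ww_acquire_ctx ctx;

	ww_acquire_init(&ctx, &reservation_ww_class);
	list_for_each_entry(bo, &eviction_list, lru) {
		/* trylock under the acquire context so lockdep can nest it */
		if (!ww_mutex_trylock(&bo->resv->lock, &ctx))
			continue;	/* contended, skip this object */
		/* ... evict, then ww_mutex_unlock(&bo->resv->lock) ... */
	}
	ww_acquire_fini(&ctx);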
[peterz: rework actual ww_mutex_trylock() implementations] Signed-off-by: Maarten Lankhorst Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YUBGPdDDjKlxAuXJ@hirez.programming.kicks-ass.net --- include/linux/dma-resv.h | 2 +- include/linux/ww_mutex.h | 15 ++------------- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index e1ca2080a1ff..39fefb86780b 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -173,7 +173,7 @@ static inline int dma_resv_lock_slow_interruptible(struct dma_resv *obj, */ static inline bool __must_check dma_resv_trylock(struct dma_resv *obj) { - return ww_mutex_trylock(&obj->lock); + return ww_mutex_trylock(&obj->lock, NULL); } /** diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 29db736af86d..bb763085479a 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -28,12 +28,10 @@ #ifndef CONFIG_PREEMPT_RT #define WW_MUTEX_BASE mutex #define ww_mutex_base_init(l,n,k) __mutex_init(l,n,k) -#define ww_mutex_base_trylock(l) mutex_trylock(l) #define ww_mutex_base_is_locked(b) mutex_is_locked((b)) #else #define WW_MUTEX_BASE rt_mutex #define ww_mutex_base_init(l,n,k) __rt_mutex_init(l,n,k) -#define ww_mutex_base_trylock(l) rt_mutex_trylock(l) #define ww_mutex_base_is_locked(b) rt_mutex_base_is_locked(&(b)->rtmutex) #endif @@ -339,17 +337,8 @@ ww_mutex_lock_slow_interruptible(struct ww_mutex *lock, extern void ww_mutex_unlock(struct ww_mutex *lock); -/** - * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context - * @lock: mutex to lock - * - * Trylocks a mutex without acquire context, so no deadlock detection is - * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. - */ -static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock) -{ - return ww_mutex_base_trylock(&lock->base); -} +extern int __must_check ww_mutex_trylock(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx); /*** * ww_mutex_destroy - mark a w/w mutex unusable -- cgit v1.2.3 From a2e05ddda11b0bd529f443df9089ab498b2c2642 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 11 Aug 2021 10:59:20 +0800 Subject: lockdep: Improve comments in wait-type checks Comments in the wait-type checks are improved by mentioning the PREEMPT_RT kernel configuration option. Signed-off-by: Zhouyi Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lkml.kernel.org/r/20210811025920.20751-1-zhouzhouyi@gmail.com --- include/linux/lockdep_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 3e726ace5c62..d22430840b53 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -21,7 +21,7 @@ enum lockdep_wait_type { LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ #ifdef CONFIG_PROVE_RAW_LOCK_NESTING - LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ + LD_WAIT_CONFIG, /* preemptible in PREEMPT_RT, spinlock_t etc.. */ #else LD_WAIT_CONFIG = LD_WAIT_SPIN, #endif -- cgit v1.2.3 From f7427ba5ce9c5438ad392b6cbcc4ca8a0487d7e7 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 25 Aug 2021 15:07:04 +0800 Subject: locking/lockdep: Cleanup the repeated declaration 'struct task_struct' has been declared twice, so keep the top one and clean up the repeated one.
Signed-off-by: Shaokun Zhang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1629875224-32751-1-git-send-email-zhangshaokun@hisilicon.com --- include/linux/debug_locks.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 3f49e65169c6..dbb409d77d4f 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -47,8 +47,6 @@ extern int debug_locks_off(void); # define locking_selftest() do { } while (0) #endif -struct task_struct; - #ifdef CONFIG_LOCKDEP extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); -- cgit v1.2.3 From f68d08c437f98ee19a14142b9de2d7afe2032d5c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Sep 2021 11:15:50 -0700 Subject: net: phy: bcm7xxx: Add EPHY entry for 72165 72165 is a 16nm process SoC with a 10/100 integrated Ethernet PHY; create a new macro and set of functions for this different process type. Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20210917181551.2836036-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 3308cebe1c19..07e1dfadbbdf 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -32,6 +32,7 @@ #define PHY_ID_BCM72113 0x35905310 #define PHY_ID_BCM72116 0x35905350 +#define PHY_ID_BCM72165 0x35905340 #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7255 0xae025120 #define PHY_ID_BCM7260 0xae025190 -- cgit v1.2.3 From 335ff4990cf3bfa42d8846f9b3d8c09456f51801 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 17 Sep 2021 11:29:03 -0700 Subject: bpf: Merge printk and seq_printf VARARG max macros MAX_SNPRINTF_VARARGS and MAX_SEQ_PRINTF_VARARGS are used by bpf helpers bpf_snprintf and bpf_seq_printf to limit their varargs. Both call into bpf_bprintf_prepare for print formatting logic and have convenience macros in libbpf (BPF_SNPRINTF, BPF_SEQ_PRINTF) which use the same helper macros to convert varargs to a byte array. Changing shared functionality to support more varargs for either bpf helper would affect the other as well, so let's combine the _VARARGS macros to make this more obvious. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210917182911.2426606-2-davemarchevsky@fb.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f4c16f19f83e..be8d57e6e78a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2216,6 +2216,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +#define MAX_BPRINTF_VARARGS 12 + int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 **bin_buf, u32 num_args); void bpf_bprintf_cleanup(void); -- cgit v1.2.3 From 10aceb629e198429c849d5e995c3bb1ba7a9aaa3 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 17 Sep 2021 11:29:05 -0700 Subject: bpf: Add bpf_trace_vprintk helper This helper is meant to be "bpf_trace_printk, but with proper vararg support". Follow bpf_snprintf's example and take a u64 pseudo-vararg array. Write to /sys/kernel/debug/tracing/trace_pipe using the same mechanism as bpf_trace_printk.
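A hedged sketch of the resulting calling convention from a BPF program (names and values are illustrative; the libbpf convenience macros pack the varargs the same way):

	static const char fmt[] = "pid %d prio %d cpu %d nr %d\n";
	u64 args[] = { pid, prio, cpu, nr };	/* more args than bpf_trace_printk's limit of 3 */

	bpf_trace_vprintk(fmt, sizeof(fmt), args, sizeof(args));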
The functionality of this helper was requested in the libbpf issue tracker [0]. [0] Closes: https://github.com/libbpf/libbpf/issues/315 Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210917182911.2426606-4-davemarchevsky@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be8d57e6e78a..b6c45a6cbbba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1088,6 +1088,7 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); -- cgit v1.2.3 From 55c42fa7fa331f98062c32799456420930b8bf8c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 16:33:19 -0700 Subject: mptcp: add MPTCP_INFO getsockopt It's not compatible with the multipath-tcp.org kernel one. 1. The out-of-tree implementation defines a different 'struct mptcp_info', with embedded __user addresses for additional data such as endpoint addresses. 2. Mat Martineau points out that embedded __user addresses don't work with BPF_CGROUP_RUN_PROG_GETSOCKOPT() which assumes that copying in optsize bytes from optval provides all data that got copied to userspace. This provides mptcp_info data for the given mptcp socket. Userspace sets optlen to the size of the structure it expects. The kernel updates it to contain the number of bytes that it copied. This allows appending more information to the structure later. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 041d6032a348..7612d760b6a9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -364,6 +364,7 @@ struct ucred { #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 +#define SOL_MPTCP 284 /* IPX options */ #define IPX_TYPE 1 -- cgit v1.2.3 From cf9579976f724ad517cc15b7caadea728c7e245c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Sep 2021 16:34:32 +0300 Subject: net: mdio: introduce a shutdown method to mdio device drivers MDIO-attached devices might have interrupts and other things that might need to be quiesced when we kexec into a new kernel. Things are even creepier when those interrupt lines are shared, and in that case it is absolutely mandatory to disable all interrupt sources. Moreover, MDIO devices might be DSA switches, and DSA needs its own shutdown method to unlink from the DSA master, which is a new requirement that appeared after commit 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings"). So introduce a ->shutdown method in the MDIO device driver structure. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- include/linux/mdio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index ffb787d5ebde..5e6dc38f418e 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -80,6 +80,9 @@ struct mdio_driver { /* Clears up any memory if needed */ void (*remove)(struct mdio_device *mdiodev); + + /* Quiesces the device on system shutdown, turns off interrupts etc */ + void (*shutdown)(struct mdio_device *mdiodev); }; static inline struct mdio_driver * -- cgit v1.2.3 From 5bd2182d58e9d9c6279b7a8a2f9b41add0e7f9cb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 16 Feb 2021 19:46:48 -0500 Subject: audit,io_uring,io-wq: add some basic audit support to io_uring This patch adds basic auditing to io_uring operations, regardless of their context. This is accomplished by allocating audit_context structures for the io-wq worker and io_uring SQPOLL kernel threads as well as explicitly auditing the io_uring operations in io_issue_sqe(). Individual io_uring operations can bypass auditing through the "audit_skip" field in the struct io_op_def definition for the operation, although great care must be taken so that security-relevant io_uring operations do not bypass auditing; please contact the audit mailing list (see the MAINTAINERS file) with any questions. The io_uring operations are audited using a new AUDIT_URINGOP record; an example is shown below: type=UNKNOWN[1336] msg=audit(1631800225.981:37289): uring_op=19 success=yes exit=0 items=0 ppid=15454 pid=15681 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) Thanks to Richard Guy Briggs for review and feedback. Signed-off-by: Paul Moore --- include/linux/audit.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 82b7c1116a85..d656a06dd909 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -286,7 +286,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); +extern int audit_alloc_kernel(struct task_struct *task); extern void __audit_free(struct task_struct *task); +extern void __audit_uring_entry(u8 op); +extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); @@ -323,6 +326,21 @@ static inline void audit_free(struct task_struct *task) if (unlikely(task->audit_context)) __audit_free(task); } +static inline void audit_uring_entry(u8 op) +{ + /* + * We intentionally check audit_context() before audit_enabled as most + * Linux systems (as of ~2021) rely on systemd which forces audit to + * be enabled regardless of the user's audit configuration.
+ */ + if (unlikely(audit_context() && audit_enabled)) + __audit_uring_entry(op); +} +static inline void audit_uring_exit(int success, long code) +{ + if (unlikely(!audit_dummy_context())) + __audit_uring_exit(success, code); +} static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) @@ -554,8 +572,16 @@ static inline int audit_alloc(struct task_struct *task) { return 0; } +static inline int audit_alloc_kernel(struct task_struct *task) +{ + return 0; +} static inline void audit_free(struct task_struct *task) { } +static inline void audit_uring_entry(u8 op) +{ } +static inline void audit_uring_exit(int success, long code) +{ } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) -- cgit v1.2.3 From 3a862cacf8670729b1ad8fc28e4f7e078f9c570c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 1 Feb 2021 19:22:44 -0500 Subject: fs: add anon_inode_getfile_secure() similar to anon_inode_getfd_secure() Extending the secure anonymous inode support to other subsystems requires that we have a secure anon_inode_getfile() variant in addition to the existing secure anon_inode_getfd() variant. Thankfully we can reuse the existing __anon_inode_getfile() function and just wrap it with the proper arguments. Acked-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/anon_inodes.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 71881a2b6f78..5deaddbd7927 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -15,6 +15,10 @@ struct inode; struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); +struct file *anon_inode_getfile_secure(const char *name, + const struct file_operations *fops, + void *priv, int flags, + const struct inode *context_inode); int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags); int anon_inode_getfd_secure(const char *name, -- cgit v1.2.3 From cdc1404a40461faba23c5a5ad40adcc7eecc1580 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 1 Feb 2021 19:56:49 -0500 Subject: lsm,io_uring: add LSM hooks to io_uring A full explanation of io_uring is beyond the scope of this commit description, but in summary it is an asynchronous I/O mechanism which allows for I/O requests and the resulting data to be queued in memory mapped "rings" which are shared between the kernel and userspace. Optionally, io_uring offers the ability for applications to spawn kernel threads to dequeue I/O requests from the ring and submit the requests in the kernel, helping to minimize the syscall overhead. Rings are accessed in userspace by memory mapping a file descriptor provided by io_uring_setup(2), and can be shared between applications as one might do with any open file descriptor. Finally, process credentials can be registered with a given ring and any process with access to that ring can submit I/O requests using any of the registered credentials. While the io_uring functionality is widely recognized as offering a vastly improved, and high performing asynchronous I/O mechanism, its ability to allow processes to submit I/O requests with credentials other than its own presents a challenge to LSMs.
When a process creates a new io_uring ring, the ring's credentials are inherited from the calling process; if this ring is shared with another process operating with different credentials there is the potential to bypass the LSM's security policy. Similarly, registering credentials with a given ring allows any process with access to that ring to submit I/O requests with those credentials. In an effort to allow LSMs to apply security policy to io_uring I/O operations, this patch adds two new LSM hooks. These hooks, in conjunction with the LSM anonymous inode support previously submitted, allow an LSM to apply access control policy to the sharing of io_uring rings as well as any io_uring credential changes requested by a process. The new LSM hooks are described below: * int security_uring_override_creds(cred) Controls if the current task, executing an io_uring operation, is allowed to override its credentials with @cred. In cases where the current task is a user application, the current credentials will be those of the user application. In cases where the current task is a kernel thread servicing io_uring requests the current credentials will be those of the io_uring ring (inherited from the process that created the ring). * int security_uring_sqpoll(void) Controls if the current task is allowed to create an io_uring polling thread (IORING_SETUP_SQPOLL). Without a SQPOLL thread in the kernel, processes must submit I/O requests via io_uring_enter(2) which allows us to compare any requested credential changes against the application making the request. With a SQPOLL thread, we can no longer compare requested credential changes against the application making the request; the comparison is made against the ring's credentials. Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 5 +++++ include/linux/lsm_hooks.h | 13 +++++++++++++ include/linux/security.h | 16 ++++++++++++++++ 3 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 2adeea44c0d5..b3c525353769 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -402,3 +402,8 @@ LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */ + +#ifdef CONFIG_IO_URING +LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) +LSM_HOOK(int, 0, uring_sqpoll, void) +#endif /* CONFIG_IO_URING */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 5c4c5c0602cb..0eb0ae95c4c4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1557,6 +1557,19 @@ * Read perf_event security info if allowed. * @perf_event_write: * Write perf_event security info if allowed. + * + * Security hooks for io_uring + * + * @uring_override_creds: + * Check if the current task, executing an io_uring operation, is allowed + * to override its credentials with @new. + * + * @new: the new creds to use + * + * @uring_sqpoll: + * Check whether the current task is allowed to spawn an io_uring polling + * thread (IORING_SETUP_SQPOLL). + * */ union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...)
RET (*NAME)(__VA_ARGS__); diff --git a/include/linux/security.h b/include/linux/security.h index 5b7288521300..7979b9629a42 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2038,4 +2038,20 @@ static inline int security_perf_event_write(struct perf_event *event) #endif /* CONFIG_SECURITY */ #endif /* CONFIG_PERF_EVENTS */ +#ifdef CONFIG_IO_URING +#ifdef CONFIG_SECURITY +extern int security_uring_override_creds(const struct cred *new); +extern int security_uring_sqpoll(void); +#else +static inline int security_uring_override_creds(const struct cred *new) +{ + return 0; +} +static inline int security_uring_sqpoll(void) +{ + return 0; +} +#endif /* CONFIG_SECURITY */ +#endif /* CONFIG_IO_URING */ + #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.2.3 From e840f42a49925707fca90e6c7a4095118fdb8c4d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 19 Sep 2021 14:09:49 +0100 Subject: KVM: arm64: Fix PMU probe ordering Russell reported that since 5.13, KVM's probing of the PMU has started to fail on his HW. As it turns out, there is an implicit ordering dependency between the architectural PMU probing code and KVM's own probing. If, due to probe ordering reasons, KVM probes before the PMU driver, it will fail to detect the PMU and prevent it from being advertised to guests as well as the VMM. Obviously, this is one probing too many, and we should be able to deal with any ordering. Add a callback from the PMU code into KVM to advertise the registration of a host CPU PMU, allowing for any probing order. Fixes: 5421db1be3b1 ("KVM: arm64: Divorce the perf code from oprofile helpers") Reported-by: "Russell King (Oracle)" Tested-by: Russell King (Oracle) Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/YUYRKVflRtUytzy5@shell.armlinux.org.uk Cc: stable@vger.kernel.org --- include/linux/perf/arm_pmu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 505480217cf1..2512e2f9cd4e 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,6 +163,12 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif +#ifdef CONFIG_KVM +void kvm_host_pmu_init(struct arm_pmu *pmu); +#else +#define kvm_host_pmu_init(x) do { } while(0) +#endif + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); -- cgit v1.2.3 From f279294b329363eb6ada568e494d609ef78e3e8e Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 20:44:14 +0800 Subject: misc_cgroup: introduce misc.events to count failures Introduce misc.events to make it easier for us to understand the pressure of resources. Currently only the 'max' event is implemented, which indicates the number of times the resource was about to exceed the max limit.
Signed-off-by: Chunguang Xu Reviewed-by: Vipin Sharma Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index da2367e2ac1e..091f2d2a1aec 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -36,6 +36,7 @@ struct misc_cg; struct misc_res { unsigned long max; atomic_long_t usage; + atomic_long_t events; bool failed; }; @@ -46,6 +47,10 @@ struct misc_res { */ struct misc_cg { struct cgroup_subsys_state css; + + /* misc.events */ + struct cgroup_file events_file; + struct misc_res res[MISC_CG_RES_TYPES]; }; -- cgit v1.2.3 From b03357528fd9516600ff358ab5d4b887b8cb80e8 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 17 Sep 2021 20:44:15 +0800 Subject: misc_cgroup: remove error log to avoid log flood In scenarios where containers are frequently created and deleted, a large number of error logs may be generated. The logs only show which node is about to go over the max limit, not the node whose resource request failed. As misc.events now provides the relevant information, we can remove this log. Signed-off-by: Chunguang Xu Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 091f2d2a1aec..c238207d1615 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -37,7 +37,6 @@ struct misc_res { unsigned long max; atomic_long_t usage; atomic_long_t events; - bool failed; }; /** -- cgit v1.2.3 From d4ffd5df9d18031b6a53f934388726775b4452d3 Mon Sep 17 00:00:00 2001 From: Jiashuo Liang Date: Fri, 30 Jul 2021 11:01:52 +0800 Subject: x86/fault: Fix wrong signal when vsyscall fails with pkey The function __bad_area_nosemaphore() calls kernelmode_fixup_or_oops() with the parameter @signal actually being @pkey, which will send a signal numbered with the argument in @pkey. This bug can be triggered when the kernel fails to access user-given memory pages that are protected by a pkey, so it can go down the do_user_addr_fault() path and pass the !user_mode() check in __bad_area_nosemaphore(). Most cases will simply run the kernel fixup code to make an -EFAULT. But when the additional condition current->thread.sig_on_uaccess_err is met, which is only used to emulate vsyscall, the kernel will generate the wrong signal. Add a new parameter @pkey to kernelmode_fixup_or_oops() to fix this. [ bp: Massage commit message, fix build error as reported by the 0day bot: https://lkml.kernel.org/r/202109202245.APvuT8BX-lkp@intel.com ] Fixes: 5042d40a264c ("x86/fault: Bypass no_context() for implicit kernel faults from usermode") Reported-by: kernel test robot Signed-off-by: Jiashuo Liang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20210730030152.249106-1-liangjs@pku.edu.cn --- include/linux/pkeys.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 6beb26b7151d..86be8bf27b41 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -4,6 +4,8 @@ #include +#define ARCH_DEFAULT_PKEY 0 + #ifdef CONFIG_ARCH_HAS_PKEYS #include #else /* !
CONFIG_ARCH_HAS_PKEYS */ -- cgit v1.2.3 From 4373b3dc922038e8924f648506f6556f2afa7e77 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 9 Sep 2021 11:45:13 -0700 Subject: fscrypt: remove fscrypt_operations::max_namelen The max_namelen field is unnecessary, as it is set to 255 (NAME_MAX) on all filesystems that support fscrypt (or plan to support fscrypt). For simplicity, just use NAME_MAX directly instead. Link: https://lore.kernel.org/r/20210909184513.139281-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e912ed9141d9..91ea9477e9bd 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -118,9 +118,6 @@ struct fscrypt_operations { */ bool (*empty_dir)(struct inode *inode); - /* The filesystem's maximum ciphertext filename length, in bytes */ - unsigned int max_namelen; - /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations -- cgit v1.2.3 From 9d881361206ebcf6285c2ec2ef275aff80875347 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 21 Sep 2021 12:42:25 +0300 Subject: bus: ti-sysc: Add quirk handling for reinit on context lost Some interconnect target modules such as otg and gpmc on am335x need a re-init after resume. As we also have PM runtime cases where the context may be lost, let's handle these all with cpu_pm. For the am335x resume path, we already have cpu_pm_resume() call cpu_pm_cluster_exit(). Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 9837fb011f2f..989aa30c598d 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_QUIRK_REINIT_ON_CTX_LOST BIT(28) #define SYSC_QUIRK_REINIT_ON_RESUME BIT(27) #define SYSC_QUIRK_GPMC_DEBUG BIT(26) #define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) -- cgit v1.2.3 From 6a52bc2b81fa06b7ab98472c7d80644e8e071af6 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 21 Sep 2021 12:42:25 +0300 Subject: bus: ti-sysc: Add quirk handling for reset on re-init At least the am335x gpmc module needs a reset in addition to re-init on resume. Let's add quirk handling for reset on re-init. Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 989aa30c598d..f11244f5c0b6 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_QUIRK_RESET_ON_CTX_LOST BIT(29) #define SYSC_QUIRK_REINIT_ON_CTX_LOST BIT(28) #define SYSC_QUIRK_REINIT_ON_RESUME BIT(27) #define SYSC_QUIRK_GPMC_DEBUG BIT(26) -- cgit v1.2.3 From 5c99fa737c693a50419606d8f8266974a3521b32 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 21 Sep 2021 12:42:25 +0300 Subject: bus: ti-sysc: Handle otg force idle quirk Let's add handling for the otg force idle quirk for the old omap2430 glue layer used up to omap4, as the musb driver quirk only works if the driver is loaded.
Unlike with the am335x glue layer, it looks like we don't need the quirk handling for SYSC_QUIRK_REINIT_ON_CTX_LOST. Eventually, when all the musb-using SoCs are booting with device-tree-based configuration, we can just remove the related quirk handling from the musb driver. Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index f11244f5c0b6..eb556f988d57 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_OTG BIT(30) #define SYSC_QUIRK_RESET_ON_CTX_LOST BIT(29) #define SYSC_QUIRK_REINIT_ON_CTX_LOST BIT(28) #define SYSC_QUIRK_REINIT_ON_RESUME BIT(27) -- cgit v1.2.3 From d6da08ed1425180b8d54c828ec06d247fd915d60 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 20 Sep 2021 14:54:14 -0700 Subject: net: phy: broadcom: Add IDDQ-SR mode Add support for putting the PHY into IDDQ Soft Recovery mode by setting the TOP_MISC register bits accordingly. This requires us to implement a custom bcm54xx_suspend() routine which diverges from genphy_suspend() in order to configure the PHY to enter IDDQ with software recovery as well as avoid doing a read/modify/write on the BMCR register. Doing a read/modify/write on the BMCR register means that the auto-negotiation bit may remain set, which interferes with the ability to put the PHY into IDDQ-SR mode. We do a software reset upon suspend in order to put the PHY back into its state prior to suspend as recommended by the datasheet. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 07e1dfadbbdf..b119d6819d6c 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -67,6 +67,7 @@ #define PHY_BRCM_CLEAR_RGMII_MODE 0x00000004 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000008 #define PHY_BRCM_EN_MASTER_MODE 0x00000010 +#define PHY_BRCM_IDDQ_SUSPEND 0x00000020 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) @@ -84,6 +85,7 @@ #define MII_BCM54XX_EXP_DATA 0x15 /* Expansion register data */ #define MII_BCM54XX_EXP_SEL 0x17 /* Expansion register select */ +#define MII_BCM54XX_EXP_SEL_TOP 0x0d00 /* TOP_MISC expansion register select */ #define MII_BCM54XX_EXP_SEL_SSD 0x0e00 /* Secondary SerDes select */ #define MII_BCM54XX_EXP_SEL_ER 0x0f00 /* Expansion register select */ #define MII_BCM54XX_EXP_SEL_ETC 0x0d00 /* Expansion register spare + 2k mem */ @@ -243,6 +245,12 @@ #define MII_BCM54XX_EXP_EXP97 0x0f97 #define MII_BCM54XX_EXP_EXP97_MYST 0x0c0c +/* Top-MISC expansion registers */ +#define BCM54XX_TOP_MISC_IDDQ_CTRL (MII_BCM54XX_EXP_SEL_TOP + 0x06) +#define BCM54XX_TOP_MISC_IDDQ_LP (1 << 0) +#define BCM54XX_TOP_MISC_IDDQ_SD (1 << 2) +#define BCM54XX_TOP_MISC_IDDQ_SR (1 << 3) + /* * BCM5482: Secondary SerDes registers */ -- cgit v1.2.3 From c86a2d9058c5a4a05d20ef89e699b7a6b2c89da6 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 17 Sep 2021 00:27:05 +0200 Subject: cpumask: Omit terminating null byte in cpumap_print_{list,bitmask}_to_buf The changes in the patch series [1] introduced a terminating null byte when reading from cpulist or cpumap sysfs files, for example: $ xxd /sys/devices/system/node/node0/cpulist 00000000: 302d
310a 00 0-1.. Before this change, the output looked as follows: $ xxd /sys/devices/system/node/node0/cpulist 00000000: 302d 310a 0-1. Fix this regression by excluding the terminating null byte from the returned length in cpumap_print_list_to_buf and cpumap_print_bitmask_to_buf. [1] https://lore.kernel.org/all/20210806110251.560-1-song.bao.hua@hisilicon.com/ Fixes: 1fae562983ca ("cpumask: introduce cpumap_print_list/bitmask_to_buf to support large bitmask and list") Acked-by: Barry Song Acked-by: Yury Norov Signed-off-by: Tobias Klauser Link: https://lore.kernel.org/r/20210916222705.13554-1-tklauser@distanz.ch Signed-off-by: Greg Kroah-Hartman --- include/linux/cpumask.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 5d4d07a9e1ed..1e7399fc69c0 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -996,14 +996,15 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * - * Returns the length of how many bytes have been copied. + * Returns the length of how many bytes have been copied, excluding + * terminating '\0'. */ static inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), - nr_cpu_ids, off, count); + nr_cpu_ids, off, count) - 1; } /** @@ -1018,7 +1019,7 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), - nr_cpu_ids, off, count); + nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG -- cgit v1.2.3 From 06dc660e6eb8817c4c379d2ca290ae0b3f77c69f Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 14 Sep 2021 01:27:08 +1000 Subject: PCI: Rename pcibios_add_device() to pcibios_device_add() The general convention for pcibios_* hooks is that they're named after the corresponding pci_* function they provide a hook for. The exception is pcibios_add_device(), which provides a hook for pci_device_add(). Rename pcibios_add_device() to pcibios_device_add() so it matches pci_device_add(). Also, remove the export of the microblaze version. The only caller must be compiled as a built-in, so there's no reason for the export. Link: https://lore.kernel.org/r/20210913152709.48013-1-oohall@gmail.com Signed-off-by: Oliver O'Halloran Signed-off-by: Bjorn Helgaas Acked-by: Niklas Schnelle # s390 --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..7e0ce3a4d5a1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2126,7 +2126,7 @@ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); -int pcibios_add_device(struct pci_dev *dev); +int pcibios_device_add(struct pci_dev *dev); void pcibios_release_device(struct pci_dev *dev); #ifdef CONFIG_PCI void pcibios_penalize_isa_irq(int irq, int active); -- cgit v1.2.3 From d1c6e08e7503649e4a4f3f9e700e2c05300b6379 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Sep 2021 22:11:37 -0700 Subject: libnvdimm/labels: Add uuid helpers In preparation for CXL labels that move the uuid to a different offset in the label, add nsl_{ref,get,validate}_uuid().
These helpers use the proper uuid_t type. That type definition predated the libnvdimm subsystem, so now is as good a time as any to convert all the uuid handling in the subsystem to uuid_t to match the helpers. Note that the uuid fields in the label data and superblocks are not replaced, per Andy's expectation that uuid_t is a kernel-internal type that should not appear in external ABI interfaces. So, in those cases {import,export}_uuid() is used to go between the two types. Also note that this rework uncovered some unnecessary copies for label comparisons; those are cleaned up with nsl_uuid_equal(). As for the whitespace changes, all new code is clang-format compliant. Reported-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/163116429748.2460985.15659993454313919977.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/nd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nd.h b/include/linux/nd.h index ee9ad76afbba..8a8c63edb1b2 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -88,7 +88,7 @@ struct nd_namespace_pmem { struct nd_namespace_io nsio; unsigned long lbasize; char *alt_name; - u8 *uuid; + uuid_t *uuid; int id; }; @@ -105,7 +105,7 @@ struct nd_namespace_pmem { struct nd_namespace_blk { struct nd_namespace_common common; char *alt_name; - u8 *uuid; + uuid_t *uuid; int id; unsigned long lbasize; resource_size_t size; -- cgit v1.2.3 From 8c75d585b931ac874fbe4ee5a8f1811d20c2817f Mon Sep 17 00:00:00 2001 From: Deepak Kumar Singh Date: Tue, 31 Aug 2021 20:00:27 +0530 Subject: soc: qcom: aoss: Expose send for generic usecase Not all upcoming usecases will have an interface to allow the aoss driver to hook onto. Expose the send API and create a get function to enable drivers to send their own messages to aoss. Signed-off-by: Chris Lew Signed-off-by: Deepak Kumar Singh Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/1630420228-31075-2-git-send-email-deesin@codeaurora.org --- include/linux/soc/qcom/qcom_aoss.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 include/linux/soc/qcom/qcom_aoss.h (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qcom_aoss.h b/include/linux/soc/qcom/qcom_aoss.h new file mode 100644 index 000000000000..3c2a82e606f8 --- /dev/null +++ b/include/linux/soc/qcom/qcom_aoss.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#ifndef __QCOM_AOSS_H__ +#define __QCOM_AOSS_H__ + +#include +#include + +struct qmp; + +#if IS_ENABLED(CONFIG_QCOM_AOSS_QMP) + +int qmp_send(struct qmp *qmp, const void *data, size_t len); +struct qmp *qmp_get(struct device *dev); +void qmp_put(struct qmp *qmp); + +#else + +static inline int qmp_send(struct qmp *qmp, const void *data, size_t len) +{ + return -ENODEV; +} + +static inline struct qmp *qmp_get(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + +static inline void qmp_put(struct qmp *qmp) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 20c36ce2164f1774b487d443ece99b754bc6ad43 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Thu, 16 Sep 2021 10:52:03 +0800 Subject: irqdomain: Change the type of 'size' in __irq_domain_add() to be consistent The 'size' is used in struct_size(domain, revmap, size), and the corresponding parameter type there is 'size_t' (an unsigned type).
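For context, struct_size() (from linux/overflow.h) computes the size of a structure with a trailing flexible array while saturating rather than wrapping on multiplication overflow; a minimal illustration with made-up field names, not the real irq_domain layout:

	struct demo {
		unsigned int count;
		struct irq_data *revmap[];	/* flexible array member */
	};

	struct demo *p;

	/* struct_size(p, revmap, size) == sizeof(*p) + size * sizeof(*p->revmap) */
	p = kzalloc(struct_size(p, revmap, size), GFP_KERNEL);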
Change 'size' to 'unsigned int' to make the types consistent. Signed-off-by: Bixuan Cui Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210916025203.44841-1-cuibixuan@huawei.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 23e4ee523576..9ee238ad29ce 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -251,7 +251,7 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) } void irq_domain_free_fwnode(struct fwnode_handle *fwnode); -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data); -- cgit v1.2.3 From a68de80f61f6af397bc06fb391ff2e571c9c4d80 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 1 Sep 2021 13:30:27 -0700 Subject: entry: rseq: Call rseq_handle_notify_resume() in tracehook_notify_resume() Invoke rseq_handle_notify_resume() from tracehook_notify_resume() now that the two functions are always called back-to-back by architectures that have rseq. The rseq helper is stubbed out for architectures that don't support rseq, i.e. this is a nop across the board. Note, tracehook_notify_resume() is horribly named and arguably does not belong in tracehook.h as literally every line of code in it has nothing to do with tracing. But, that's been true since commit a42c6ded827d ("move key_replace_session_keyring() into tracehook_notify_resume()") first usurped tracehook_notify_resume() back in 2012. Punt cleaning that mess up to future patches. No functional change intended. Acked-by: Mathieu Desnoyers Signed-off-by: Sean Christopherson Message-Id: <20210901203030.1292304-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/tracehook.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3e80c4bc66f7..2564b7434b4d 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -197,6 +197,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) mem_cgroup_handle_over_high(); blkcg_maybe_throttle_current(); + + rseq_handle_notify_resume(NULL, regs); } /* -- cgit v1.2.3 From 4eeef2424153e79910d65248b5e1abf137d050e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Sep 2021 11:32:19 -0700 Subject: KVM: x86: Query vcpu->vcpu_idx directly and drop its accessor Read vcpu->vcpu_idx directly instead of bouncing through the one-line wrapper, kvm_vcpu_get_idx(), and drop the wrapper. The wrapper is a remnant of the original implementation and serves no purpose; remove it before it gains more users. Back when kvm_vcpu_get_idx() was added by commit 497d72d80a78 ("KVM: Add kvm_vcpu_get_idx to get vcpu index in kvm->vcpus"), the implementation was more than just a simple wrapper as vcpu->vcpu_idx did not exist and retrieving the index meant walking over the vCPU array to find the given vCPU. When vcpu_idx was introduced by commit 8750e72a79dd ("KVM: remember position in kvm->vcpus array"), the helper was left behind, likely to avoid extra thrash (but even then there were only two users, the original arm usage having been removed at some point in the past). No functional change intended.
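Illustratively, call sites convert as follows (hypothetical caller, not taken from this patch):

	-	if (kvm_vcpu_get_idx(vcpu) == 0)
	+	if (vcpu->vcpu_idx == 0)
			kvm_make_request(KVM_REQ_EVENT, vcpu);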
Suggested-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: Vitaly Kuznetsov Message-Id: <20210910183220.2397812-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 041ca7f15ea4..000ea73dd324 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -721,11 +721,6 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } -static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) -{ - return vcpu->vcpu_idx; -} - #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + slots->used_slots; memslot++) \ -- cgit v1.2.3 From b468e688240b58ece498c430c377bb81b78a41f1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:20 +0200 Subject: tty: remove flags from struct tty_ldisc_ops The last user was apparently removed by commit a352def21a64 ("tty: Ldisc revamp") in 2008. So remove the field completely, along with its only setter (n_tty_inherit_ops) and its only possible value (LDISC_FLAG_DEFINED). Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-2-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index b1d812e902aa..1f37184eee63 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -180,7 +180,6 @@ extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, struct tty_ldisc_ops { char *name; int num; - int flags; /* * The following routines are called from above. */ @@ -220,8 +219,6 @@ struct tty_ldisc { struct tty_struct *tty; }; -#define LDISC_FLAG_DEFINED 0x00000001 - #define MODULE_ALIAS_LDISC(ldisc) \ MODULE_ALIAS("tty-ldisc-" __stringify(ldisc)) -- cgit v1.2.3 From 7894193436b65f304ba790abe2532a7a3bbeb7c8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:21 +0200 Subject: tty: remove extern from functions in tty headers After the recent headers cleanup, some function declarations still have extern before them. It is superfluous (for function declarations), so remove extern from those which still have it. This unifies them with the rest of the files.
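For function declarations the two forms below are equivalent, so dropping 'extern' loses nothing; e.g.:

	extern void tty_hangup(struct tty_struct *tty);	/* before */
	void tty_hangup(struct tty_struct *tty);	/* after, same meaning */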
Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-3-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 139 ++++++++++++++++++++++----------------------- include/linux/tty_driver.h | 8 +-- include/linux/tty_flip.h | 20 +++---- include/linux/tty_ldisc.h | 22 +++---- 4 files changed, 94 insertions(+), 95 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 168e57e40bbb..15453b0c4081 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -252,20 +252,20 @@ static inline bool tty_throttled(struct tty_struct *tty) } #ifdef CONFIG_TTY -extern void tty_kref_put(struct tty_struct *tty); -extern struct pid *tty_get_pgrp(struct tty_struct *tty); -extern void tty_vhangup_self(void); -extern void disassociate_ctty(int priv); -extern dev_t tty_devnum(struct tty_struct *tty); -extern void proc_clear_tty(struct task_struct *p); -extern struct tty_struct *get_current_tty(void); +void tty_kref_put(struct tty_struct *tty); +struct pid *tty_get_pgrp(struct tty_struct *tty); +void tty_vhangup_self(void); +void disassociate_ctty(int priv); +dev_t tty_devnum(struct tty_struct *tty); +void proc_clear_tty(struct task_struct *p); +struct tty_struct *get_current_tty(void); /* tty_io.c */ -extern int __init tty_init(void); -extern const char *tty_name(const struct tty_struct *tty); -extern struct tty_struct *tty_kopen_exclusive(dev_t device); -extern struct tty_struct *tty_kopen_shared(dev_t device); -extern void tty_kclose(struct tty_struct *tty); -extern int tty_dev_name_to_number(const char *name, dev_t *number); +int __init tty_init(void); +const char *tty_name(const struct tty_struct *tty); +struct tty_struct *tty_kopen_exclusive(dev_t device); +struct tty_struct *tty_kopen_shared(dev_t device); +void tty_kclose(struct tty_struct *tty); +int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) { } @@ -296,7 +296,7 @@ static inline int tty_dev_name_to_number(const char *name, dev_t *number) extern struct ktermios tty_std_termios; -extern int vcs_init(void); +int vcs_init(void); extern struct class *tty_class; @@ -316,34 +316,34 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) return tty; } -extern const char *tty_driver_name(const struct tty_struct *tty); -extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); -extern void stop_tty(struct tty_struct *tty); -extern void start_tty(struct tty_struct *tty); -extern void tty_write_message(struct tty_struct *tty, char *msg); -extern int tty_send_xchar(struct tty_struct *tty, char ch); -extern int tty_put_char(struct tty_struct *tty, unsigned char c); -extern unsigned int tty_chars_in_buffer(struct tty_struct *tty); -extern unsigned int tty_write_room(struct tty_struct *tty); -extern void tty_driver_flush_buffer(struct tty_struct *tty); -extern void tty_unthrottle(struct tty_struct *tty); -extern int tty_throttle_safe(struct tty_struct *tty); -extern int tty_unthrottle_safe(struct tty_struct *tty); -extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); -extern int tty_get_icount(struct tty_struct *tty, - struct serial_icounter_struct *icount); -extern int is_current_pgrp_orphaned(void); -extern void tty_hangup(struct tty_struct *tty); -extern void tty_vhangup(struct tty_struct *tty); -extern int tty_hung_up_p(struct file *filp); -extern void do_SAK(struct tty_struct *tty); -extern void __do_SAK(struct tty_struct *tty); -extern void 
no_tty(void); -extern speed_t tty_termios_baud_rate(struct ktermios *termios); -extern void tty_termios_encode_baud_rate(struct ktermios *termios, - speed_t ibaud, speed_t obaud); -extern void tty_encode_baud_rate(struct tty_struct *tty, - speed_t ibaud, speed_t obaud); +const char *tty_driver_name(const struct tty_struct *tty); +void tty_wait_until_sent(struct tty_struct *tty, long timeout); +void stop_tty(struct tty_struct *tty); +void start_tty(struct tty_struct *tty); +void tty_write_message(struct tty_struct *tty, char *msg); +int tty_send_xchar(struct tty_struct *tty, char ch); +int tty_put_char(struct tty_struct *tty, unsigned char c); +unsigned int tty_chars_in_buffer(struct tty_struct *tty); +unsigned int tty_write_room(struct tty_struct *tty); +void tty_driver_flush_buffer(struct tty_struct *tty); +void tty_unthrottle(struct tty_struct *tty); +int tty_throttle_safe(struct tty_struct *tty); +int tty_unthrottle_safe(struct tty_struct *tty); +int tty_do_resize(struct tty_struct *tty, struct winsize *ws); +int tty_get_icount(struct tty_struct *tty, + struct serial_icounter_struct *icount); +int is_current_pgrp_orphaned(void); +void tty_hangup(struct tty_struct *tty); +void tty_vhangup(struct tty_struct *tty); +int tty_hung_up_p(struct file *filp); +void do_SAK(struct tty_struct *tty); +void __do_SAK(struct tty_struct *tty); +void no_tty(void); +speed_t tty_termios_baud_rate(struct ktermios *termios); +void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, + speed_t obaud); +void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, + speed_t obaud); /** * tty_get_baud_rate - get tty bit rates @@ -363,37 +363,37 @@ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) unsigned char tty_get_char_size(unsigned int cflag); unsigned char tty_get_frame_size(unsigned int cflag); -extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); -extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); -extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); +void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); +int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); +int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); -extern void tty_wakeup(struct tty_struct *tty); +void tty_wakeup(struct tty_struct *tty); -extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, +int tty_mode_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); -extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); -extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); -extern void tty_release_struct(struct tty_struct *tty, int idx); -extern void tty_init_termios(struct tty_struct *tty); -extern void tty_save_termios(struct tty_struct *tty); -extern int tty_standard_install(struct tty_driver *driver, +int tty_perform_flush(struct tty_struct *tty, unsigned long arg); +struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); +void tty_release_struct(struct tty_struct *tty, int idx); +void tty_init_termios(struct tty_struct *tty); +void tty_save_termios(struct tty_struct *tty); +int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; /* n_tty.c */ -extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); +void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY -extern void __init n_tty_init(void); +void __init 
n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT -extern void tty_audit_exit(void); -extern void tty_audit_fork(struct signal_struct *sig); -extern int tty_audit_push(void); +void tty_audit_exit(void); +void tty_audit_fork(struct signal_struct *sig); +int tty_audit_push(void); #else static inline void tty_audit_exit(void) { @@ -408,24 +408,23 @@ static inline int tty_audit_push(void) #endif /* tty_ioctl.c */ -extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg); +int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg); /* vt.c */ -extern int vt_ioctl(struct tty_struct *tty, - unsigned int cmd, unsigned long arg); +int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); -extern long vt_compat_ioctl(struct tty_struct *tty, - unsigned int cmd, unsigned long arg); +long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, + unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ -extern void tty_lock(struct tty_struct *tty); -extern int tty_lock_interruptible(struct tty_struct *tty); -extern void tty_unlock(struct tty_struct *tty); -extern void tty_lock_slave(struct tty_struct *tty); -extern void tty_unlock_slave(struct tty_struct *tty); -extern void tty_set_lock_subclass(struct tty_struct *tty); +void tty_lock(struct tty_struct *tty); +int tty_lock_interruptible(struct tty_struct *tty); +void tty_unlock(struct tty_struct *tty); +void tty_lock_slave(struct tty_struct *tty); +void tty_unlock_slave(struct tty_struct *tty); +void tty_set_lock_subclass(struct tty_struct *tty); #endif diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index c20431d8def8..29e1cf178afb 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -327,11 +327,11 @@ struct tty_driver { extern struct list_head tty_drivers; -extern struct tty_driver *__tty_alloc_driver(unsigned int lines, - struct module *owner, unsigned long flags); -extern struct tty_driver *tty_find_polling_driver(char *name, int *line); +struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, + unsigned long flags); +struct tty_driver *tty_find_polling_driver(char *name, int *line); -extern void tty_driver_kref_put(struct tty_driver *driver); +void tty_driver_kref_put(struct tty_driver *driver); /* Use TTY_DRIVER_* flags below */ #define tty_alloc_driver(lines, flags) \ diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 32284992b31a..9916acb5de49 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -7,16 +7,16 @@ struct tty_ldisc; -extern int tty_buffer_set_limit(struct tty_port *port, int limit); -extern unsigned int tty_buffer_space_avail(struct tty_port *port); -extern int tty_buffer_request_room(struct tty_port *port, size_t size); -extern int tty_insert_flip_string_flags(struct tty_port *port, +int tty_buffer_set_limit(struct tty_port *port, int limit); +unsigned int tty_buffer_space_avail(struct tty_port *port); +int tty_buffer_request_room(struct tty_port *port, size_t size); +int tty_insert_flip_string_flags(struct tty_port *port, const unsigned char *chars, const char *flags, size_t size); -extern int tty_insert_flip_string_fixed_flag(struct tty_port *port, +int tty_insert_flip_string_fixed_flag(struct tty_port *port, const unsigned char *chars, char flag, size_t size); -extern int tty_prepare_flip_string(struct tty_port *port, - 
unsigned char **chars, size_t size); -extern void tty_flip_buffer_push(struct tty_port *port); +int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars, + size_t size); +void tty_flip_buffer_push(struct tty_port *port); void tty_schedule_flip(struct tty_port *port); int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag); @@ -45,7 +45,7 @@ static inline int tty_insert_flip_string(struct tty_port *port, int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, const char *f, int count); -extern void tty_buffer_lock_exclusive(struct tty_port *port); -extern void tty_buffer_unlock_exclusive(struct tty_port *port); +void tty_buffer_lock_exclusive(struct tty_port *port); +void tty_buffer_unlock_exclusive(struct tty_port *port); #endif /* _LINUX_TTY_FLIP_H */ diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 1f37184eee63..4d1c128afbfa 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -146,7 +146,7 @@ struct ld_semaphore { #endif }; -extern void __init_ldsem(struct ld_semaphore *sem, const char *name, +void __init_ldsem(struct ld_semaphore *sem, const char *name, struct lock_class_key *key); #define init_ldsem(sem) \ @@ -157,18 +157,18 @@ do { \ } while (0) -extern int ldsem_down_read(struct ld_semaphore *sem, long timeout); -extern int ldsem_down_read_trylock(struct ld_semaphore *sem); -extern int ldsem_down_write(struct ld_semaphore *sem, long timeout); -extern int ldsem_down_write_trylock(struct ld_semaphore *sem); -extern void ldsem_up_read(struct ld_semaphore *sem); -extern void ldsem_up_write(struct ld_semaphore *sem); +int ldsem_down_read(struct ld_semaphore *sem, long timeout); +int ldsem_down_read_trylock(struct ld_semaphore *sem); +int ldsem_down_write(struct ld_semaphore *sem, long timeout); +int ldsem_down_write_trylock(struct ld_semaphore *sem); +void ldsem_up_read(struct ld_semaphore *sem); +void ldsem_up_write(struct ld_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, - long timeout); -extern int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, - long timeout); +int ldsem_down_read_nested(struct ld_semaphore *sem, int subclass, + long timeout); +int ldsem_down_write_nested(struct ld_semaphore *sem, int subclass, + long timeout); #else # define ldsem_down_read_nested(sem, subclass, timeout) \ ldsem_down_read(sem, timeout) -- cgit v1.2.3 From 28f194da4a2c4d431552025a4386edaffab181bd Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:22 +0200 Subject: tty: make tty_ldisc_ops::hangup return void The documentation says that the return value of the tty_ldisc_ops::hangup hook is ignored. And it really is, so there is no point in its return type being int. Switch it, and all the hook implementations, to void. Cc: Dmitry Torokhov Cc: Wolfgang Grandegger Cc: Marc Kleine-Budde Cc: "David S.
Miller" Cc: Jakub Kicinski Cc: Paul Mackerras Cc: Liam Girdwood Cc: Mark Brown Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Peter Ujfalusi Acked-by: Dmitry Torokhov Acked-by: Mark Brown Acked-by: Marc Kleine-Budde Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-4-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 4d1c128afbfa..b85d84fb5f49 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -199,7 +199,7 @@ struct tty_ldisc_ops { void (*set_termios)(struct tty_struct *tty, struct ktermios *old); __poll_t (*poll)(struct tty_struct *, struct file *, struct poll_table_struct *); - int (*hangup)(struct tty_struct *tty); + void (*hangup)(struct tty_struct *tty); /* * The following routines are called from below. -- cgit v1.2.3 From dcc223e8b9bf3f987bcd3c327a6737022b39e35b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:23 +0200 Subject: tty: remove file from tty_mode_ioctl The only user of 'file' parameter in tty_mode_ioctl is a BUG_ON check. Provided it never crashed for anyone, it's an overkill to pass the parameter to tty_mode_ioctl only for this check. If we wanted to check 'file' there, we should handle it in more graceful way anyway. Not by a BUG == crash. Cc: Wolfgang Grandegger Cc: Marc Kleine-Budde Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Andreas Koensgen Cc: Paul Mackerras Acked-by: Marc Kleine-Budde Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-5-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 15453b0c4081..5e73f577c2b4 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -369,8 +369,7 @@ int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); void tty_wakeup(struct tty_struct *tty); -int tty_mode_ioctl(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg); +int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); int tty_perform_flush(struct tty_struct *tty, unsigned long arg); struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); void tty_release_struct(struct tty_struct *tty, int idx); -- cgit v1.2.3 From 7c783601a3bc22a54cce0fb650259a983c8cafba Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 14 Sep 2021 11:11:24 +0200 Subject: tty: remove file from n_tty_ioctl_helper After the previous patch, there are no users of 'file' in n_tty_ioctl_helper. So remove it also from there. Cc: Marcel Holtmann Cc: Johan Hedberg Cc: Luiz Augusto von Dentz Cc: Paul Mackerras Cc: "David S. 
Miller" Cc: Jakub Kicinski Cc: Krzysztof Kozlowski Acked-by: Krzysztof Kozlowski Signed-off-by: Jiri Slaby Link: https://lore.kernel.org/r/20210914091134.17426-6-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 5e73f577c2b4..5dbd7c5afac7 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -407,8 +407,8 @@ static inline int tty_audit_push(void) #endif /* tty_ioctl.c */ -int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, - unsigned int cmd, unsigned long arg); +int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, + unsigned long arg); /* vt.c */ -- cgit v1.2.3 From 23c64d7618a7e76e46db99444d184ef75498fb71 Mon Sep 17 00:00:00 2001 From: Piyush Mehta Date: Wed, 22 Sep 2021 19:23:17 +0530 Subject: firmware: zynqmp: Add MMIO read and write support for PS_MODE pin Add Xilinx ZynqMP firmware MMIO APIs support to set and get PS_MODE pins value and status. These APIs create an interface path between mode pin controller driver and low-level API to access GPIO pins. Signed-off-by: Piyush Mehta Acked-by: Michal Simek Acked-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- include/linux/firmware/xlnx-zynqmp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 56b426fe020c..3917f89b2b3c 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -72,6 +72,8 @@ enum pm_api_id { PM_SET_REQUIREMENT = 15, PM_RESET_ASSERT = 17, PM_RESET_GET_STATUS = 18, + PM_MMIO_WRITE = 19, + PM_MMIO_READ = 20, PM_PM_INIT_FINALIZE = 21, PM_FPGA_LOAD = 22, PM_FPGA_GET_STATUS = 23, @@ -390,6 +392,8 @@ int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); +unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode); +int zynqmp_pm_bootmode_write(u32 ps_mode); int zynqmp_pm_init_finalize(void); int zynqmp_pm_set_suspend_mode(u32 mode); int zynqmp_pm_request_node(const u32 node, const u32 capabilities, @@ -520,6 +524,16 @@ static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, return -ENODEV; } +static inline unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_bootmode_write(u32 ps_mode) +{ + return -ENODEV; +} + static inline int zynqmp_pm_init_finalize(void) { return -ENODEV; -- cgit v1.2.3 From 68a81bb2eebdeaabb04ae04c4d89c3bb5bec3b32 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 22 Sep 2021 16:57:03 +0300 Subject: net: dsa: sja1105: remove sp->dp It looks like this field was never used since its introduction in commit 227d07a07ef1 ("net: dsa: sja1105: Add support for traffic through standalone ports") remove it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/dsa/sja1105.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 171106202fe5..71b69ec52108 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -65,7 +65,6 @@ struct sja1105_port { struct kthread_work xmit_work; struct sk_buff_head xmit_queue; struct sja1105_tagger_data *data; - struct dsa_port *dp; bool hwts_tx_en; }; -- cgit v1.2.3 From 6d709cadfde68dbd12bef12fcced6222226dcb06 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 22 Sep 2021 17:37:25 +0300 Subject: net: dsa: move sja1110_process_meta_tstamp inside the tagging protocol driver The problem is that DSA tagging protocols really must not depend on the switch driver, because this creates a circular dependency at insmod time, and the switch driver will effectively not load when the tagging protocol driver is missing. The code was structured in the way it was for a reason, though. The DSA driver-facing API for PTP timestamping relies on the assumption that two-step TX timestamps are provided by the hardware in an out-of-band manner, typically by raising an interrupt and making that timestamp available inside some sort of FIFO which is to be accessed over SPI/MDIO/etc. So the API puts .port_txtstamp into dsa_switch_ops, because it is expected that the switch driver needs to save some state (like put the skb into a queue until its TX timestamp arrives). On SJA1110, TX timestamps are provided by the switch as Ethernet packets, so they are received and processed by the tagging protocol driver. This in itself is great, because the timestamps are full 64-bit and do not require reconstruction, and since Ethernet is the fastest I/O method available to/from the switch, PTP timestamps arrive very quickly, no matter how bottlenecked the SPI connection is, because SPI interaction is not needed at all. DSA's code structure and strict isolation between the tagging protocol driver and the switch driver break the natural code organization. When the tagging protocol driver receives a packet which is classified as a metadata packet containing timestamps, it passes those timestamps one by one to the switch driver, which then proceeds to compare them based on the recorded timestamp ID that was generated in .port_txtstamp. The communication between the tagging protocol and the switch driver is done through a method exported by the switch driver, sja1110_process_meta_tstamp. To satisfy build requirements, we force a dependency to build the tagging protocol driver as a module when the switch driver is a module. However, as explained in the first paragraph, that causes the circular dependency. To solve this, move the skb queue from struct sja1105_private :: struct sja1105_ptp_data to struct sja1105_private :: struct sja1105_tagger_data. The latter is a data structure for which hacks have already been put into place to be able to create persistent storage per switch that is accessible from the tagging protocol driver (see sja1105_setup_ports). With the skb queue directly accessible from the tagging protocol driver, we can now move sja1110_process_meta_tstamp into the tagging driver itself, and avoid exporting a symbol. Fixes: 566b18c8b752 ("net: dsa: sja1105: implement TX timestamping for SJA1110") Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ Signed-off-by: Vladimir Oltean Signed-off-by: David S.
Miller --- include/linux/dsa/sja1105.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 71b69ec52108..9d5a1053b276 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -48,6 +48,10 @@ struct sja1105_tagger_data { spinlock_t meta_lock; unsigned long state; u8 ts_id; + /* Used on SJA1110 where meta frames are generated only for + * 2-step TX timestamps + */ + struct sk_buff_head skb_txtstamp_queue; }; struct sja1105_skb_cb { @@ -68,25 +72,20 @@ struct sja1105_port { bool hwts_tx_en; }; -enum sja1110_meta_tstamp { - SJA1110_META_TSTAMP_TX = 0, - SJA1110_META_TSTAMP_RX = 1, -}; - -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) - -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp); - -#else +/* Timestamps are in units of 8 ns clock ticks (equivalent to + * a fixed 125 MHz clock). + */ +#define SJA1105_TICK_NS 8 -static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, - u8 ts_id, enum sja1110_meta_tstamp dir, - u64 tstamp) +static inline s64 ns_to_sja1105_ticks(s64 ns) { + return ns / SJA1105_TICK_NS; } -#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ +static inline s64 sja1105_ticks_to_ns(s64 ticks) +{ + return ticks * SJA1105_TICK_NS; } #if IS_ENABLED(CONFIG_NET_DSA_SJA1105) -- cgit v1.2.3 From f5aef4241592765a868611509e5170bbe8c89ae7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 22 Sep 2021 17:37:26 +0300 Subject: net: dsa: sja1105: break dependency between dsa_port_is_sja1105 and switch driver It's nice to be able to test a tagging protocol with dsa_loop, but not at the cost of losing the ability to build the tagging protocol and switch driver as modules, because as things stand, there is a circular dependency between the two. Tagging protocol drivers cannot depend on switch drivers; that is a hard fact. The reasoning behind the blamed patch was that accessing dp->priv should first make sure that the structure behind that pointer is what we really think it is. Currently the "sja1105" and "sja1110" tagging protocols only operate with the sja1105 switch driver, just like any other tagging protocol and switch combination. The only way to mix and match them is by modifying the code, and this applies to dsa_loop as well (by default that uses DSA_TAG_PROTO_NONE). So while in principle there is an issue, in practice there isn't one. Until we extend dsa_loop to allow user space configuration, treat the problem as a non-issue and just say that DSA ports found by tag_sja1105 are always sja1105 ports, which is in fact true. But keep the dsa_port_is_sja1105 function so that it's easy to patch it during testing, and rely on dead code elimination. Fixes: 994d2cbb08ca ("net: dsa: tag_sja1105: be dsa_loop-safe") Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ Signed-off-by: Vladimir Oltean Signed-off-by: David S.
Miller --- include/linux/dsa/sja1105.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 9d5a1053b276..e6c78be40bde 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -87,22 +87,9 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks) return ticks * SJA1105_TICK_NS; } -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105) - -extern const struct dsa_switch_ops sja1105_switch_ops; - -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) -{ - return dp->ds->ops == &sja1105_switch_ops; -} - -#else - static inline bool dsa_port_is_sja1105(struct dsa_port *dp) { - return false; + return true; } -#endif - #endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From 6bc6db000295332bae2c1e8815d7450b72923d23 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 18 Sep 2021 08:56:29 +0800 Subject: KVM: Remove tlbs_dirty There is no user of tlbs_dirty. Signed-off-by: Lai Jiangshan Signed-off-by: Paolo Bonzini Message-Id: <20210918005636.3675-4-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 000ea73dd324..0f18df7fe874 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -608,7 +608,6 @@ struct kvm { unsigned long mmu_notifier_range_start; unsigned long mmu_notifier_range_end; #endif - long tlbs_dirty; struct list_head devices; u64 manual_dirty_log_protect; struct dentry *debugfs_dentry; -- cgit v1.2.3 From d50497c4a05e73e76874fd0d4942036375a7ec0f Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Wed, 15 Sep 2021 18:46:27 -0700 Subject: platform/chrome: cros_ec_proto: Fix check_features ret val The kerneldoc for cros_ec_check_features() states that it returns 1 or 0 depending on whether a feature is supported or not, but it instead returns a negative error number in one case, and a non-1 bitmask in other cases. Since all call-sites only check for a 1 or 0 return value, update the function to return boolean values. Signed-off-by: Prashant Malani Reviewed-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210916014632.2662612-1-pmalani@chromium.org --- include/linux/platform_data/cros_ec_proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 02599687770c..55844ece0b32 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -227,7 +227,7 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev, u32 cros_ec_get_host_event(struct cros_ec_device *ec_dev); -int cros_ec_check_features(struct cros_ec_dev *ec, int feature); +bool cros_ec_check_features(struct cros_ec_dev *ec, int feature); int cros_ec_get_sensor_count(struct cros_ec_dev *ec); -- cgit v1.2.3 From 5501765a02a6c324f78581e6bb8209d054fe13ae Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Wed, 15 Sep 2021 10:09:38 -0700 Subject: driver core: fw_devlink: Add support for FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD If a parent device is also a supplier to a child device, fw_devlink=on by design delays the probe() of the child device until the probe() of the parent finishes successfully. However, some drivers of such parent devices (where parent is also a supplier) expect the child device to finish probing successfully as soon as it is added using device_add() and before the probe() of the parent device has completed successfully.
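A sketch of the pattern in question (the driver shape and the parent_add_child() helper are hypothetical, purely for illustration):

	static int parent_probe(struct platform_device *pdev)
	{
		/* hypothetical helper that creates and adds a child device */
		struct device *child = parent_add_child(pdev);

		/*
		 * The driver expects the child to be bound here, before
		 * parent_probe() returns; fw_devlink=on would otherwise
		 * defer the child's probe until the parent has finished
		 * probing, hence the new flag.
		 */
		if (!device_is_bound(child))
			return -EPROBE_DEFER;

		return 0;
	}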
However, some drivers of such parent devices (where parent is also a supplier) expect the child device to finish probing successfully as soon as they are added using device_add() and before the probe() of the parent device has completed successfully. One example of such a case is discussed in the link mentioned below. Add a flag to make fw_devlink=on not enforce these supplier-consumer relationships, so these drivers can continue working. Link: https://lore.kernel.org/netdev/CAGETcx_uj0V4DChME-gy5HGKTYnxLBX=TH2rag29f_p=UcG+Tg@mail.gmail.com/ Fixes: ea718c699055 ("Revert "Revert "driver core: Set fw_devlink=on by default""") Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210915170940.617415-3-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 59828516ebaf..9f4ad719bfe3 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -22,10 +22,15 @@ struct device; * LINKS_ADDED: The fwnode has already be parsed to add fwnode links. * NOT_DEVICE: The fwnode will never be populated as a struct device. * INITIALIZED: The hardware corresponding to fwnode has been initialized. + * NEEDS_CHILD_BOUND_ON_ADD: For this fwnode/device to probe successfully, its + * driver needs its child devices to be bound with + * their respective drivers as soon as they are + * added. */ -#define FWNODE_FLAG_LINKS_ADDED BIT(0) -#define FWNODE_FLAG_NOT_DEVICE BIT(1) -#define FWNODE_FLAG_INITIALIZED BIT(2) +#define FWNODE_FLAG_LINKS_ADDED BIT(0) +#define FWNODE_FLAG_NOT_DEVICE BIT(1) +#define FWNODE_FLAG_INITIALIZED BIT(2) +#define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD BIT(3) struct fwnode_handle { struct fwnode_handle *secondary; -- cgit v1.2.3 From 3fd445a4d49fce594eecc90b9bbcf85cd223154d Mon Sep 17 00:00:00 2001 From: Len Baker Date: Sat, 4 Sep 2021 11:22:17 +0200 Subject: brcmfmac: Replace zero-length array with flexible array member There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use "flexible array members"[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Also, make use of the struct_size() helper in devm_kzalloc(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.14/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Len Baker Reviewed-by: Gustavo A. R. 
Silva Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210904092217.2848-1-len.baker@gmx.com --- include/linux/platform_data/brcmfmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h index 1d30bf278231..2b5676ff35be 100644 --- a/include/linux/platform_data/brcmfmac.h +++ b/include/linux/platform_data/brcmfmac.h @@ -125,7 +125,7 @@ struct brcmfmac_pd_cc_entry { */ struct brcmfmac_pd_cc { int table_size; - struct brcmfmac_pd_cc_entry table[0]; + struct brcmfmac_pd_cc_entry table[]; }; /** -- cgit v1.2.3 From ae98f40d32cd0ee6fc222e765734ffa497a0a95e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 23 Sep 2021 13:57:32 -0700 Subject: net: phy: broadcom: Fix PHY_BRCM_IDDQ_SUSPEND definition An extraneous digit was added during the inclusion of that change; correct it so that we use a single bit, as expected by the PHY driver. Reported-by: Justin Chen Fixes: d6da08ed1425 ("net: phy: broadcom: Add IDDQ-SR mode") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index b119d6819d6c..27d9b6683f0e 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -67,7 +67,7 @@ #define PHY_BRCM_CLEAR_RGMII_MODE 0x00000004 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00000008 #define PHY_BRCM_EN_MASTER_MODE 0x00000010 -#define PHY_BRCM_IDDQ_SUSPEND 0x000000220 +#define PHY_BRCM_IDDQ_SUSPEND 0x00000020 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) -- cgit v1.2.3 From 243418e3925d5b5b0657ae54c322d43035e97eed Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 24 Sep 2021 15:43:47 -0700 Subject: mm: fs: invalidate bh_lrus for only cold path The kernel test robot reported a regression in fio.write_iops [1] with commit 8cc621d2f45d ("mm: fs: invalidate BH LRU during page migration"). Since lru_add_drain is called frequently, invalidating bh_lrus there could increase the bh_lrus cache miss ratio, which results in more IO in the end. This patch moves the bh_lrus invalidation from the hot path (e.g., zap_page_range, pagevec_release) to the cold path (i.e., lru_add_drain_all, lru_cache_disable).
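Conceptually, the cold path now drains both together; a sketch of the resulting shape (simplified, assuming the mm/swap.c structure of that time; not the literal patch):

	/* cold path only: drain the local CPU's pagevecs and bh_lrus */
	static void lru_add_and_bh_lrus_drain(void)
	{
		local_lock(&lru_pvecs.lock);
		lru_add_drain_cpu(smp_processor_id());
		local_unlock(&lru_pvecs.lock);
		invalidate_bh_lrus_cpu();
	}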
Zhengjun Xing confirmed: "I test the patch, the regression reduced to -2.9%" [1] https://lore.kernel.org/lkml/20210520083144.GD14190@xsang-OptiPlex-9020/ [2] 8cc621d2f45d, mm: fs: invalidate BH LRU during page migration Link: https://lkml.kernel.org/r/20210907212347.1977686-1-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kernel test robot Reviewed-by: Chris Goldsworthy Tested-by: "Xing, Zhengjun" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6486d3c19463..36f33685c8c0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,7 +194,7 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); -void invalidate_bh_lrus_cpu(int cpu); +void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -408,7 +408,7 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 -- cgit v1.2.3 From 57ed7b4303a1c4d1885019fef03e6a5af2e8468a Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Fri, 24 Sep 2021 15:43:53 -0700 Subject: mm/debug: sync up latest migrate_reason to migrate_reason_names Sync up MR_DEMOTION with migrate_reason_names and add a comment prompting future additions to be kept in sync. Link: https://lkml.kernel.org/r/20210921064553.293905-3-o451686892@gmail.com Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim") Signed-off-by: Weizhao Ouyang Reviewed-by: "Huang, Ying" Reviewed-by: John Hubbard Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Yang Shi Cc: Zi Yan Cc: Dave Hansen Cc: Minchan Kim Cc: Mina Almasry Cc: "Matthew Wilcox (Oracle)" Cc: Oscar Salvador Cc: Wei Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 326250996b4e..c8077e936691 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,11 @@ struct migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/* + * Keep sync with: + * - macro MIGRATE_REASON in include/trace/events/migrate.h + * - migrate_reason_names[MR_TYPES] in mm/debug.c + */ enum migrate_reason { MR_COMPACTION, MR_MEMORY_FAILURE, @@ -32,7 +37,6 @@ enum migrate_reason { MR_TYPES }; -/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION -- cgit v1.2.3 From e7f18c22e6bea258ffd65185fdab66d1e63dd5bd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Aug 2021 13:42:43 -0700 Subject: stddef: Fix kerndoc for sizeof_field() and offsetofend() Adjust the comment styles so these are correctly identified as valid kern-doc.
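A short worked example of the two helpers (illustrative struct, assuming natural alignment):

	struct sample {
		u8 a;		/* offset 0 */
		u32 b;		/* offset 4, after 3 bytes of padding */
	};

	sizeof_field(struct sample, b);		/* == 4 */
	offsetofend(struct sample, b);		/* == 4 + 4 == 8 */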
Signed-off-by: Kees Cook --- include/linux/stddef.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 998a4ba28eba..8553b33143d1 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -20,7 +20,7 @@ enum { #endif /** - * sizeof_field(TYPE, MEMBER) + * sizeof_field() - Report the size of a struct field in bytes * * @TYPE: The structure containing the field of interest * @MEMBER: The field to return the size of @@ -28,7 +28,7 @@ enum { #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) /** - * offsetofend(TYPE, MEMBER) + * offsetofend() - Report the offset of a struct field within the struct * * @TYPE: The type of the structure * @MEMBER: The member within the structure to get the end offset of -- cgit v1.2.3 From 50d7bd38c3aafc4749e05e8d7fcb616979143602 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 May 2021 20:01:15 -0700 Subject: stddef: Introduce struct_group() helper macro Kernel code has a regular need to describe groups of members within a structure, usually when they need to be copied or initialized separately from the rest of the surrounding structure. The generally accepted design pattern in C is to use a named sub-struct: struct foo { int one; struct { int two; int three, four; } thing; int five; }; This would allow for traditional references and sizing: memcpy(&dst.thing, &src.thing, sizeof(dst.thing)); However, doing this would mean that referencing struct members enclosed by such named structs would always require including the sub-struct name in identifiers: do_something(dst.thing.three); This has tended to be quite inflexible, especially when such groupings need to be added to established code, which causes huge naming churn. Three workarounds exist in the kernel for this problem, and each has other negative properties. To avoid the naming churn, there is a design pattern of adding macro aliases for the named struct: #define f_three thing.three This ends up polluting the global namespace, and makes it difficult to search for identifiers. Another common work-around in kernel code avoids the pollution by avoiding the named struct entirely, instead identifying the group's boundaries using either a pair of empty anonymous structs or a pair of zero-element arrays: struct foo { int one; struct { } start; int two; int three, four; struct { } finish; int five; }; struct foo { int one; int start[0]; int two; int three, four; int finish[0]; int five; }; This allows code to avoid needing to use a sub-struct named for member references within the surrounding structure, but loses the benefits of being able to actually use such a struct, making it rather fragile. Using these requires open-coded calculation of sizes and offsets. The efforts made to avoid common mistakes include lots of comments, or adding various BUILD_BUG_ON()s. Such code is left with no way for the compiler to reason about the boundaries (e.g.
the "start" object looks like it's 0 bytes in length), making bounds checking depend on open-coded calculations: if (length > offsetof(struct foo, finish) - offsetof(struct foo, start)) return -EINVAL; memcpy(&dst.start, &src.start, offsetof(struct foo, finish) - offsetof(struct foo, start)); However, the vast majority of places in the kernel that operate on groups of members do so without any identification of the grouping, relying either on comments or implicit knowledge of the struct contents, which is even harder for the compiler to reason about, and results in even more fragile manual sizing, usually depending on member locations outside of the region (e.g. to copy "two" and "three", use the start of "four" to find the size): BUILD_BUG_ON((offsetof(struct foo, four) < offsetof(struct foo, two)) || (offsetof(struct foo, four) < offsetof(struct foo, three)); if (length > offsetof(struct foo, four) - offsetof(struct foo, two)) return -EINVAL; memcpy(&dst.two, &src.two, length); In order to have a regular programmatic way to describe a struct region that can be used for references and sizing, can be examined for bounds checking, avoids forcing the use of intermediate identifiers, and avoids polluting the global namespace, introduce the struct_group() macro. This macro wraps the member declarations to create an anonymous union of an anonymous struct (no intermediate name) and a named struct (for references and sizing): struct foo { int one; struct_group(thing, int two; int three, four; ); int five; }; if (length > sizeof(src.thing)) return -EINVAL; memcpy(&dst.thing, &src.thing, length); do_something(dst.three); There are some rare cases where the resulting struct_group() needs attributes added, so struct_group_attr() is also introduced to allow for specifying struct attributes (e.g. __align(x) or __packed). Additionally, there are places where such declarations would like to have the struct be tagged, so struct_group_tagged() is added. Given there is a need for a handful of UAPI uses too, the underlying __struct_group() macro has been defined in UAPI so it can be used there too. To avoid confusing scripts/kernel-doc, hide the macro from its struct parsing. Co-developed-by: Keith Packard Signed-off-by: Keith Packard Acked-by: Gustavo A. R. Silva Link: https://lore.kernel.org/lkml/20210728023217.GC35706@embeddedor Enhanced-by: Rasmus Villemoes Link: https://lore.kernel.org/lkml/41183a98-bdb9-4ad6-7eab-5a7292a6df84@rasmusvillemoes.dk Enhanced-by: Dan Williams Link: https://lore.kernel.org/lkml/1d9a2e6df2a9a35b2cdd50a9a68cac5991e7e5f0.camel@intel.com Enhanced-by: Daniel Vetter Link: https://lore.kernel.org/lkml/YQKa76A6XuFqgM03@phenom.ffwll.local Acked-by: Dan Williams Signed-off-by: Kees Cook --- include/linux/stddef.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 8553b33143d1..8b103a53b000 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -36,4 +36,52 @@ enum { #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) +/** + * struct_group() - Wrap a set of declarations in a mirrored struct + * + * @NAME: The identifier name of the mirrored sub-struct + * @MEMBERS: The member declarations for the mirrored structs + * + * Used to create an anonymous union of two structs with identical + * layout and size: one anonymous and one named. 
The former can be + * used normally without sub-struct naming, and the latter can be + * used to reason about the start, end, and size of the group of + * struct members. + */ +#define struct_group(NAME, MEMBERS...) \ + __struct_group(/* no tag */, NAME, /* no attrs */, MEMBERS) + +/** + * struct_group_attr() - Create a struct_group() with trailing attributes + * + * @NAME: The identifier name of the mirrored sub-struct + * @ATTRS: Any struct attributes to apply + * @MEMBERS: The member declarations for the mirrored structs + * + * Used to create an anonymous union of two structs with identical + * layout and size: one anonymous and one named. The former can be + * used normally without sub-struct naming, and the latter can be + * used to reason about the start, end, and size of the group of + * struct members. Includes structure attributes argument. + */ +#define struct_group_attr(NAME, ATTRS, MEMBERS...) \ + __struct_group(/* no tag */, NAME, ATTRS, MEMBERS) + +/** + * struct_group_tagged() - Create a struct_group with a reusable tag + * + * @TAG: The tag name for the named sub-struct + * @NAME: The identifier name of the mirrored sub-struct + * @MEMBERS: The member declarations for the mirrored structs + * + * Used to create an anonymous union of two structs with identical + * layout and size: one anonymous and one named. The former can be + * used normally without sub-struct naming, and the latter can be + * used to reason about the start, end, and size of the group of + * struct members. Includes struct tag argument for the named copy, + * so the specified layout can be reused later. + */ +#define struct_group_tagged(TAG, NAME, MEMBERS...) \ + __struct_group(TAG, NAME, /* no attrs */, MEMBERS) + #endif -- cgit v1.2.3 From c80d92fbb67b2c80b8eeb8759ee79d676eb33520 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jun 2021 22:48:05 -0700 Subject: compiler_types.h: Remove __compiletime_object_size() Since all compilers support __builtin_object_size(), and there is only one user of __compiletime_object_size, remove it to avoid the needless indirection. This lets Clang reason about check_copy_size() correctly. 
Link: https://github.com/ClangBuiltLinux/linux/issues/1179 Suggested-by: Nick Desaulniers Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Sedat Dilek Cc: Will Deacon Cc: Marco Elver Cc: Arvind Sankar Cc: Masahiro Yamada Cc: Luc Van Oostenryck Cc: Andrew Morton Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Gabriel Krisman Bertazi Cc: Andy Lutomirski Cc: Oleg Nesterov Reviewed-by: Miguel Ojeda Signed-off-by: Kees Cook --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 5 ----- include/linux/thread_info.h | 2 +- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bd2b881c6b63..9957085b8148 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -41,8 +41,6 @@ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) -#define __compiletime_object_size(obj) __builtin_object_size(obj, 0) - #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b6ff83a714ca..05ceb2e92b0e 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -290,11 +290,6 @@ struct ftrace_likely_data { (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) -/* Compile time object size, -1 for unknown */ -#ifndef __compiletime_object_size -# define __compiletime_object_size(obj) -1 -#endif - #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0999f6317978..ad0c4e041030 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -203,7 +203,7 @@ static inline void copy_overflow(int size, unsigned long count) static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { - int sz = __compiletime_object_size(addr); + int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); -- cgit v1.2.3 From c430f60036af44079170ff71a461b9d7cf5ee431 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 14 Apr 2021 15:45:39 -0700 Subject: fortify: Move remaining fortify helpers into fortify-string.h When commit a28a6e860c6c ("string.h: move fortified functions definitions in a dedicated header.") moved the fortify-specific code, some helpers were left behind. Move the remaining fortify-specific helpers into fortify-string.h so they're together where they're used. This requires that any FORTIFY helper function prototypes be conditionally built to avoid "no prototype" warnings. Additionally removes unused helpers. 
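To illustrate what these helpers are for (a sketch, assuming CONFIG_FORTIFY_SOURCE=y; 'src' and 'runtime_len' are stand-ins): when the compiler can see both the destination size and a constant copy length, the fortified wrappers turn the overflow into a build error via __write_overflow(); otherwise the check happens at run time via fortify_panic():

	char dst[4];

	memcpy(dst, src, 8);		/* fails to build: __write_overflow() */
	memcpy(dst, src, runtime_len);	/* compiles; panics at run time if > 4 */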
Cc: Andrew Morton Cc: Daniel Axtens Cc: Vincenzo Frascino Cc: Andrey Konovalov Cc: Dan Williams Acked-by: Francis Laniel Reviewed-by: Nick Desaulniers Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 7 +++++++ include/linux/string.h | 9 --------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index c1be37437e77..7e67d02764db 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -2,6 +2,13 @@ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ +#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline)) +#define __RENAME(x) __asm__(#x) + +void fortify_panic(const char *name) __noreturn __cold; +void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); +void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); +void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); diff --git a/include/linux/string.h b/include/linux/string.h index 5e96d656be7a..ac1c769a5a80 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -249,15 +249,6 @@ static inline const char *kbasename(const char *path) return tail ? tail + 1 : path; } -#define __FORTIFY_INLINE extern __always_inline __attribute__((gnu_inline)) -#define __RENAME(x) __asm__(#x) - -void fortify_panic(const char *name) __noreturn __cold; -void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); -void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); -void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); -void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); - #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) #include #endif -- cgit v1.2.3 From 072af0c638dc8a5c7db2edc4dddbd6d44bee3bdb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 2 Aug 2021 10:25:01 -0700 Subject: fortify: Fix dropped strcpy() compile-time write overflow check The implementation for intra-object overflow in str*-family functions accidentally dropped compile-time write overflow checking in strcpy(), leaving it entirely to run-time. Add back the intended check. Fixes: 6a39e62abbaf ("lib: string.h: detect intra-object overflow in fortified string functions") Cc: Daniel Axtens Cc: Francis Laniel Signed-off-by: Kees Cook Reviewed-by: Nick Desaulniers --- include/linux/fortify-string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 7e67d02764db..68bc5978d916 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -287,7 +287,10 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) if (p_size == (size_t)-1 && q_size == (size_t)-1) return __underlying_strcpy(p, q); size = strlen(q) + 1; - /* test here to use the more stringent object size */ + /* Compile-time check for const size overflow. 
*/ + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(__func__); memcpy(p, q, size); -- cgit v1.2.3 From 369cd2165d7beac1db144b40811baa2c6b7d8c54 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Aug 2021 14:20:14 -0700 Subject: fortify: Prepare to improve strnlen() and strlen() warnings In order to have strlen() use fortified strnlen() internally, swap their positions in the source. Doing this as part of later changes makes review difficult, so reorder it here; no code changes. Cc: Francis Laniel Signed-off-by: Kees Cook Reviewed-by: Nick Desaulniers --- include/linux/fortify-string.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 68bc5978d916..a3cb1d9aacce 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -56,6 +56,17 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) return p; } +extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); +__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) +{ + size_t p_size = __builtin_object_size(p, 1); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); + return ret; +} + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; @@ -71,17 +82,6 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) return ret; } -extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); -__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) -{ - size_t p_size = __builtin_object_size(p, 1); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); - - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); - return ret; -} - /* defined after fortified strlen to reuse it */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) -- cgit v1.2.3 From 3009f891bb9f328945ebd5b71e12df7e2467f3dd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 2 Aug 2021 22:51:31 -0700 Subject: fortify: Allow strlen() and strnlen() to pass compile-time known lengths Under CONFIG_FORTIFY_SOURCE, it is possible for the compiler to perform strlen() and strnlen() at compile-time when the string size is known. This is required to support compile-time overflow checking in strlcpy().
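The idea, sketched standalone before the diff (hypothetical demo code; the in-tree form is the __compiletime_strlen() macro added below): when the compiler knows the object's size and can prove its last byte is NUL, __builtin_strlen() folds to a constant:

	const char greeting[] = "hello";	/* object size 6, known statically */

	static inline __attribute__((always_inline))
	size_t greeting_len(void)
	{
		size_t p_size = __builtin_object_size(greeting, 1);	/* 6 */

		if (p_size != (size_t)-1 &&
		    __builtin_constant_p(greeting[p_size - 1]) &&
		    greeting[p_size - 1] == '\0')
			return __builtin_strlen(greeting);	/* folds to 5 */

		return (size_t)-1;	/* not knowable at compile time */
	}

Once the fold happens, strlcpy()'s destination-size comparison becomes a compile-time constant expression as well, which is the point of this change.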
Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 49 ++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index a3cb1d9aacce..fdb0a74c9ca2 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -10,6 +10,20 @@ void __read_overflow(void) __compiletime_error("detected read beyond size of obj void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); +#define __compiletime_strlen(p) \ +({ \ + unsigned char *__p = (unsigned char *)(p); \ + size_t ret = (size_t)-1; \ + size_t p_size = __builtin_object_size(p, 1); \ + if (p_size != (size_t)-1) { \ + size_t p_len = p_size - 1; \ + if (__builtin_constant_p(__p[p_len]) && \ + __p[p_len] == '\0') \ + ret = __builtin_strlen(__p); \ + } \ + ret; \ +}) + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); @@ -60,21 +74,31 @@ extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(st __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) { size_t p_size = __builtin_object_size(p, 1); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + size_t p_len = __compiletime_strlen(p); + size_t ret; + + /* We can take compile-time actions when maxlen is const. */ + if (__builtin_constant_p(maxlen) && p_len != (size_t)-1) { + /* If p is const, we can use its compile-time-known len. */ + if (maxlen >= p_size) + return p_len; + } + /* Do not check characters beyond the end of p. */ + ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(__func__); return ret; } +/* defined after fortified strnlen to reuse it. */ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 1); - /* Work around gcc excess stack consumption issue */ - if (p_size == (size_t)-1 || - (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) + /* Give up if we don't know how large p is. */ + if (p_size == (size_t)-1) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) @@ -86,24 +110,27 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) { - size_t ret; size_t p_size = __builtin_object_size(p, 1); size_t q_size = __builtin_object_size(q, 1); + size_t q_len; /* Full count of source string length. */ + size_t len; /* Count of characters going into destination. */ if (p_size == (size_t)-1 && q_size == (size_t)-1) return __real_strlcpy(p, q, size); - ret = strlen(q); - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - - if (__builtin_constant_p(len) && len >= p_size) + q_len = strlen(q); + len = (q_len >= size) ? size - 1 : q_len; + if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) { + /* Write size is always larger than destination. 
*/ + if (len >= p_size) + __write_overflow(); + } + if (size) { if (len >= p_size) fortify_panic(__func__); __underlying_memcpy(p, q, len); p[len] = '\0'; } - return ret; + return q_len; } /* defined after fortified strnlen to reuse it */ -- cgit v1.2.3 From 405fca8a946168e71c04b82cc80727c3ea686e08 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Fri, 24 Sep 2021 06:00:47 -0400 Subject: ieee80211: add power type definition for 6 GHz 6 GHz regulatory domains introduce different modes for 6 GHz AP operations: Low Power Indoor (LPI), Standard Power (SP) and Very Low Power (VLP). 6 GHz STAs can operate as either Regular or Subordinate clients. Define the power-type flags for AP and STATION modes. Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/20210924100052.32029-2-wgong@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 694264503119..420dea9aa648 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1988,6 +1988,44 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss); +/** + * enum ieee80211_ap_reg_power - regulatory power for an Access Point + * + * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode + * @IEEE80211_REG_LPI_AP: Low Power Indoor Access Point + * @IEEE80211_REG_SP_AP: Standard power Access Point + * @IEEE80211_REG_VLP_AP: Very low power Access Point + * @IEEE80211_REG_AP_POWER_AFTER_LAST: internal + * @IEEE80211_REG_AP_POWER_MAX: maximum value + */ +enum ieee80211_ap_reg_power { + IEEE80211_REG_UNSET_AP, + IEEE80211_REG_LPI_AP, + IEEE80211_REG_SP_AP, + IEEE80211_REG_VLP_AP, + IEEE80211_REG_AP_POWER_AFTER_LAST, + IEEE80211_REG_AP_POWER_MAX = + IEEE80211_REG_AP_POWER_AFTER_LAST - 1, +}; + +/** + * enum ieee80211_client_reg_power - regulatory power for a client + * + * @IEEE80211_REG_UNSET_CLIENT: Client has no regulatory power mode + * @IEEE80211_REG_DEFAULT_CLIENT: Default Client + * @IEEE80211_REG_SUBORDINATE_CLIENT: Subordinate Client + * @IEEE80211_REG_CLIENT_POWER_AFTER_LAST: internal + * @IEEE80211_REG_CLIENT_POWER_MAX: maximum value + */ +enum ieee80211_client_reg_power { + IEEE80211_REG_UNSET_CLIENT, + IEEE80211_REG_DEFAULT_CLIENT, + IEEE80211_REG_SUBORDINATE_CLIENT, + IEEE80211_REG_CLIENT_POWER_AFTER_LAST, + IEEE80211_REG_CLIENT_POWER_MAX = + IEEE80211_REG_CLIENT_POWER_AFTER_LAST - 1, +}; + /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 #define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 -- cgit v1.2.3 From 930dfa563155179861470b2aba880eac2ae30bfb Mon Sep 17 00:00:00 2001 From: Min Li Date: Fri, 24 Sep 2021 15:01:32 -0400 Subject: ptp: clockmatrix: use rsmu driver to access i2c/spi bus The rsmu (Renesas Synchronization Management Unit) driver is located in drivers/mfd and is responsible for creating multiple devices, including the clockmatrix PHC, which will then use the exposed regmap and mutex handle to access the i2c/spi bus. Signed-off-by: Min Li Signed-off-by: David S.
Miller --- include/linux/mfd/idt8a340_reg.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/idt8a340_reg.h b/include/linux/mfd/idt8a340_reg.h index 92d763230bdf..a18c1539a152 100644 --- a/include/linux/mfd/idt8a340_reg.h +++ b/include/linux/mfd/idt8a340_reg.h @@ -506,6 +506,10 @@ #define STATE_MODE_SHIFT (0) #define STATE_MODE_MASK (0x7) +/* Bit definitions for the DPLL_MANU_REF_CFG register */ +#define MANUAL_REFERENCE_SHIFT (0) +#define MANUAL_REFERENCE_MASK (0x1f) + /* Bit definitions for the GPIO_CFG_GBL register */ #define SUPPLY_MODE_SHIFT (0) #define SUPPLY_MODE_MASK (0x3) @@ -654,7 +658,7 @@ /* Values of DPLL_N.DPLL_MODE.PLL_MODE */ enum pll_mode { PLL_MODE_MIN = 0, - PLL_MODE_NORMAL = PLL_MODE_MIN, + PLL_MODE_PLL = PLL_MODE_MIN, PLL_MODE_WRITE_PHASE = 1, PLL_MODE_WRITE_FREQUENCY = 2, PLL_MODE_GPIO_INC_DEC = 3, @@ -664,6 +668,31 @@ enum pll_mode { PLL_MODE_MAX = PLL_MODE_DISABLED, }; +/* Values of DPLL_CTRL_n.DPLL_MANU_REF_CFG.MANUAL_REFERENCE */ +enum manual_reference { + MANU_REF_MIN = 0, + MANU_REF_CLK0 = MANU_REF_MIN, + MANU_REF_CLK1, + MANU_REF_CLK2, + MANU_REF_CLK3, + MANU_REF_CLK4, + MANU_REF_CLK5, + MANU_REF_CLK6, + MANU_REF_CLK7, + MANU_REF_CLK8, + MANU_REF_CLK9, + MANU_REF_CLK10, + MANU_REF_CLK11, + MANU_REF_CLK12, + MANU_REF_CLK13, + MANU_REF_CLK14, + MANU_REF_CLK15, + MANU_REF_WRITE_PHASE, + MANU_REF_WRITE_FREQUENCY, + MANU_REF_XO_DPLL, + MANU_REF_MAX = MANU_REF_XO_DPLL, +}; + enum hw_tod_write_trig_sel { HW_TOD_WR_TRIG_SEL_MIN = 0, HW_TOD_WR_TRIG_SEL_MSB = HW_TOD_WR_TRIG_SEL_MIN, -- cgit v1.2.3 From f6bc909e7673c30abcbdb329e7d0aa2e83c103d7 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 13 Sep 2021 17:00:57 +0100 Subject: firmware: cs_dsp: add driver to support firmware loading on Cirrus Logic DSPs wm_adsp originally provided firmware loading on some audio DSP and was implemented as an ASoC codec driver. However, the firmware loading now covers a wider range of DSP cores and peripherals containing them, beyond just audio. So it needs to be available to non-audio drivers. All the core firmware loading support has been moved into a new driver cs_dsp, leaving only the ASoC-specific parts in wm_adsp. Signed-off-by: Simon Trimmer Signed-off-by: Charles Keepax Link: https://lore.kernel.org/r/20210913160057.103842-17-simont@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 242 +++++++++++++++++++++++++++++++++ include/linux/firmware/cirrus/wmfw.h | 202 +++++++++++++++++++++++++++ 2 files changed, 444 insertions(+) create mode 100644 include/linux/firmware/cirrus/cs_dsp.h create mode 100644 include/linux/firmware/cirrus/wmfw.h (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h new file mode 100644 index 000000000000..9ad9eaaaa552 --- /dev/null +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * cs_dsp.h -- Cirrus Logic DSP firmware support + * + * Based on sound/soc/codecs/wm_adsp.h + * + * Copyright 2012 Wolfson Microelectronics plc + * Copyright (C) 2015-2021 Cirrus Logic, Inc. and + * Cirrus Logic International Semiconductor Ltd. 
+ */ +#ifndef __CS_DSP_H +#define __CS_DSP_H + +#define CS_ADSP2_REGION_0 BIT(0) +#define CS_ADSP2_REGION_1 BIT(1) +#define CS_ADSP2_REGION_2 BIT(2) +#define CS_ADSP2_REGION_3 BIT(3) +#define CS_ADSP2_REGION_4 BIT(4) +#define CS_ADSP2_REGION_5 BIT(5) +#define CS_ADSP2_REGION_6 BIT(6) +#define CS_ADSP2_REGION_7 BIT(7) +#define CS_ADSP2_REGION_8 BIT(8) +#define CS_ADSP2_REGION_9 BIT(9) +#define CS_ADSP2_REGION_1_9 (CS_ADSP2_REGION_1 | \ + CS_ADSP2_REGION_2 | CS_ADSP2_REGION_3 | \ + CS_ADSP2_REGION_4 | CS_ADSP2_REGION_5 | \ + CS_ADSP2_REGION_6 | CS_ADSP2_REGION_7 | \ + CS_ADSP2_REGION_8 | CS_ADSP2_REGION_9) +#define CS_ADSP2_REGION_ALL (CS_ADSP2_REGION_0 | CS_ADSP2_REGION_1_9) + +#define CS_DSP_DATA_WORD_SIZE 3 + +#define CS_DSP_ACKED_CTL_TIMEOUT_MS 100 +#define CS_DSP_ACKED_CTL_N_QUICKPOLLS 10 +#define CS_DSP_ACKED_CTL_MIN_VALUE 0 +#define CS_DSP_ACKED_CTL_MAX_VALUE 0xFFFFFF + +/** + * struct cs_dsp_region - Describes a logical memory region in DSP address space + * @type: Memory region type + * @base: Address of region + */ +struct cs_dsp_region { + int type; + unsigned int base; +}; + +/** + * struct cs_dsp_alg_region - Describes a logical algorithm region in DSP address space + * @list: List node for internal use + * @alg: Algorithm id + * @type: Memory region type + * @base: Address of region + */ +struct cs_dsp_alg_region { + struct list_head list; + unsigned int alg; + int type; + unsigned int base; +}; + +/** + * struct cs_dsp_coeff_ctl - Describes a coefficient control + * @fw_name: Name of the firmware + * @subname: Name of the control parsed from the WMFW + * @subname_len: Length of subname + * @alg_region: Logical region associated with this control + * @dsp: DSP instance associated with this control + * @enabled: Flag indicating whether control is enabled + * @list: List node for internal use + * @cache: Cached value of the control + * @offset: Offset of control within alg_region + * @len: Length of the cached value + * @set: Flag indicating the value has been written by the user + * @flags: Bitfield of WMFW_CTL_FLAG_ control flags defined in wmfw.h + * @type: One of the WMFW_CTL_TYPE_ control types defined in wmfw.h + * @priv: For use by the client + */ +struct cs_dsp_coeff_ctl { + const char *fw_name; + /* Subname is needed to match with firmware */ + const char *subname; + unsigned int subname_len; + struct cs_dsp_alg_region alg_region; + struct cs_dsp *dsp; + unsigned int enabled:1; + struct list_head list; + void *cache; + unsigned int offset; + size_t len; + unsigned int set:1; + unsigned int flags; + unsigned int type; + + void *priv; +}; + +struct cs_dsp_ops; +struct cs_dsp_client_ops; + +/** + * struct cs_dsp - Configuration and state of a Cirrus Logic DSP + * @name: The name of the DSP instance + * @rev: Revision of the DSP + * @num: DSP instance number + * @type: Type of DSP + * @dev: Driver model representation of the device + * @regmap: Register map of the device + * @ops: Function pointers for internal callbacks + * @client_ops: Function pointers for client callbacks + * @base: Address of the DSP registers + * @base_sysinfo: Address of the sysinfo register (Halo only) + * @sysclk_reg: Address of the sysclk register (ADSP1 only) + * @sysclk_mask: Mask of frequency bits within sysclk register (ADSP1 only) + * @sysclk_shift: Shift of frequency bits within sysclk register (ADSP1 only) + * @alg_regions: List of currently loaded algorithm regions + * @fw_file_name: Filename of the current firmware + * @fw_name: Name of the current firmware + * @fw_id: ID of the current 
firmware, obtained from the wmfw + * @fw_id_version: Version of the firmware, obtained from the wmfw + * @fw_vendor_id: Vendor of the firmware, obtained from the wmfw + * @mem: DSP memory region descriptions + * @num_mems: Number of memory regions in this DSP + * @fw_ver: Version of the wmfw file format + * @booted: Flag indicating DSP has been configured + * @running: Flag indicating DSP is executing firmware + * @ctl_list: Controls defined within the loaded DSP firmware + * @lock_regions: Enable MPU traps on specified memory regions + * @pwr_lock: Lock used to serialize accesses + * @debugfs_root: Debugfs directory for this DSP instance + * @wmfw_file_name: Filename of the currently loaded firmware + * @bin_file_name: Filename of the currently loaded coefficients + */ +struct cs_dsp { + const char *name; + int rev; + int num; + int type; + struct device *dev; + struct regmap *regmap; + + const struct cs_dsp_ops *ops; + const struct cs_dsp_client_ops *client_ops; + + unsigned int base; + unsigned int base_sysinfo; + unsigned int sysclk_reg; + unsigned int sysclk_mask; + unsigned int sysclk_shift; + + struct list_head alg_regions; + + const char *fw_name; + unsigned int fw_id; + unsigned int fw_id_version; + unsigned int fw_vendor_id; + + const struct cs_dsp_region *mem; + int num_mems; + + int fw_ver; + + bool booted; + bool running; + + struct list_head ctl_list; + + struct mutex pwr_lock; + + unsigned int lock_regions; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_root; + char *wmfw_file_name; + char *bin_file_name; +#endif +}; + +/** + * struct cs_dsp_client_ops - client callbacks + * @control_add: Called under the pwr_lock when a control is created + * @control_remove: Called under the pwr_lock when a control is destroyed + * @post_run: Called under the pwr_lock by cs_dsp_run() + * @post_stop: Called under the pwr_lock by cs_dsp_stop() + * @watchdog_expired: Called when a watchdog expiry is detected + * + * These callbacks give the cs_dsp client an opportunity to respond to events + * or to perform actions atomically. 
+ */ +struct cs_dsp_client_ops { + int (*control_add)(struct cs_dsp_coeff_ctl *ctl); + void (*control_remove)(struct cs_dsp_coeff_ctl *ctl); + int (*post_run)(struct cs_dsp *dsp); + void (*post_stop)(struct cs_dsp *dsp); + void (*watchdog_expired)(struct cs_dsp *dsp); +}; + +int cs_dsp_adsp1_init(struct cs_dsp *dsp); +int cs_dsp_adsp2_init(struct cs_dsp *dsp); +int cs_dsp_halo_init(struct cs_dsp *dsp); + +int cs_dsp_adsp1_power_up(struct cs_dsp *dsp, + const struct firmware *wmfw_firmware, char *wmfw_filename, + const struct firmware *coeff_firmware, char *coeff_filename, + const char *fw_name); +void cs_dsp_adsp1_power_down(struct cs_dsp *dsp); +int cs_dsp_power_up(struct cs_dsp *dsp, + const struct firmware *wmfw_firmware, char *wmfw_filename, + const struct firmware *coeff_firmware, char *coeff_filename, + const char *fw_name); +void cs_dsp_power_down(struct cs_dsp *dsp); +int cs_dsp_run(struct cs_dsp *dsp); +void cs_dsp_stop(struct cs_dsp *dsp); + +void cs_dsp_remove(struct cs_dsp *dsp); + +int cs_dsp_set_dspclk(struct cs_dsp *dsp, unsigned int freq); +void cs_dsp_adsp2_bus_error(struct cs_dsp *dsp); +void cs_dsp_halo_bus_error(struct cs_dsp *dsp); +void cs_dsp_halo_wdt_expire(struct cs_dsp *dsp); + +void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root); +void cs_dsp_cleanup_debugfs(struct cs_dsp *dsp); + +int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int event_id); +int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, const void *buf, size_t len); +int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, void *buf, size_t len); +struct cs_dsp_coeff_ctl *cs_dsp_get_ctl(struct cs_dsp *dsp, const char *name, int type, + unsigned int alg); + +int cs_dsp_read_raw_data_block(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr, + unsigned int num_words, __be32 *data); +int cs_dsp_read_data_word(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr, u32 *data); +int cs_dsp_write_data_word(struct cs_dsp *dsp, int mem_type, unsigned int mem_addr, u32 data); +void cs_dsp_remove_padding(u32 *buf, int nwords); + +struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp, + int type, unsigned int id); + +const char *cs_dsp_mem_region_name(unsigned int type); + +#endif diff --git a/include/linux/firmware/cirrus/wmfw.h b/include/linux/firmware/cirrus/wmfw.h new file mode 100644 index 000000000000..a19bf7c6fc8b --- /dev/null +++ b/include/linux/firmware/cirrus/wmfw.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * wmfw.h - Wolfson firmware format information + * + * Copyright 2012 Wolfson Microelectronics plc + * + * Author: Mark Brown + */ + +#ifndef __WMFW_H +#define __WMFW_H + +#include + +#define WMFW_MAX_ALG_NAME 256 +#define WMFW_MAX_ALG_DESCR_NAME 256 + +#define WMFW_MAX_COEFF_NAME 256 +#define WMFW_MAX_COEFF_DESCR_NAME 256 + +#define WMFW_CTL_FLAG_SYS 0x8000 +#define WMFW_CTL_FLAG_VOLATILE 0x0004 +#define WMFW_CTL_FLAG_WRITEABLE 0x0002 +#define WMFW_CTL_FLAG_READABLE 0x0001 + +#define WMFW_CTL_TYPE_BYTES 0x0004 /* byte control */ + +/* Non-ALSA coefficient types start at 0x1000 */ +#define WMFW_CTL_TYPE_ACKED 0x1000 /* acked control */ +#define WMFW_CTL_TYPE_HOSTEVENT 0x1001 /* event control */ +#define WMFW_CTL_TYPE_HOST_BUFFER 0x1002 /* host buffer pointer */ + +struct wmfw_header { + char magic[4]; + __le32 len; + __le16 rev; + u8 core; + u8 ver; +} __packed; + +struct wmfw_footer { + __le64 timestamp; + __le32 checksum; +} __packed; + +struct wmfw_adsp1_sizes { + __le32 dm; + __le32 pm; + __le32 
zm; +} __packed; + +struct wmfw_adsp2_sizes { + __le32 xm; + __le32 ym; + __le32 pm; + __le32 zm; +} __packed; + +struct wmfw_region { + union { + __be32 type; + __le32 offset; + }; + __le32 len; + u8 data[]; +} __packed; + +struct wmfw_id_hdr { + __be32 core_id; + __be32 core_rev; + __be32 id; + __be32 ver; +} __packed; + +struct wmfw_v3_id_hdr { + __be32 core_id; + __be32 block_rev; + __be32 vendor_id; + __be32 id; + __be32 ver; +} __packed; + +struct wmfw_adsp1_id_hdr { + struct wmfw_id_hdr fw; + __be32 zm; + __be32 dm; + __be32 n_algs; +} __packed; + +struct wmfw_adsp2_id_hdr { + struct wmfw_id_hdr fw; + __be32 zm; + __be32 xm; + __be32 ym; + __be32 n_algs; +} __packed; + +struct wmfw_halo_id_hdr { + struct wmfw_v3_id_hdr fw; + __be32 xm_base; + __be32 xm_size; + __be32 ym_base; + __be32 ym_size; + __be32 n_algs; +} __packed; + +struct wmfw_alg_hdr { + __be32 id; + __be32 ver; +} __packed; + +struct wmfw_adsp1_alg_hdr { + struct wmfw_alg_hdr alg; + __be32 zm; + __be32 dm; +} __packed; + +struct wmfw_adsp2_alg_hdr { + struct wmfw_alg_hdr alg; + __be32 zm; + __be32 xm; + __be32 ym; +} __packed; + +struct wmfw_halo_alg_hdr { + struct wmfw_alg_hdr alg; + __be32 xm_base; + __be32 xm_size; + __be32 ym_base; + __be32 ym_size; +} __packed; + +struct wmfw_adsp_alg_data { + __le32 id; + u8 name[WMFW_MAX_ALG_NAME]; + u8 descr[WMFW_MAX_ALG_DESCR_NAME]; + __le32 ncoeff; + u8 data[]; +} __packed; + +struct wmfw_adsp_coeff_data { + struct { + __le16 offset; + __le16 type; + __le32 size; + } hdr; + u8 name[WMFW_MAX_COEFF_NAME]; + u8 descr[WMFW_MAX_COEFF_DESCR_NAME]; + __le16 ctl_type; + __le16 flags; + __le32 len; + u8 data[]; +} __packed; + +struct wmfw_coeff_hdr { + u8 magic[4]; + __le32 len; + union { + __be32 rev; + __le32 ver; + }; + union { + __be32 core; + __le32 core_ver; + }; + u8 data[]; +} __packed; + +struct wmfw_coeff_item { + __le16 offset; + __le16 type; + __le32 id; + __le32 ver; + __le32 sr; + __le32 len; + u8 data[]; +} __packed; + +#define WMFW_ADSP1 1 +#define WMFW_ADSP2 2 +#define WMFW_HALO 4 + +#define WMFW_ABSOLUTE 0xf0 +#define WMFW_ALGORITHM_DATA 0xf2 +#define WMFW_METADATA 0xfc +#define WMFW_NAME_TEXT 0xfe +#define WMFW_INFO_TEXT 0xff + +#define WMFW_ADSP1_PM 2 +#define WMFW_ADSP1_DM 3 +#define WMFW_ADSP1_ZM 4 + +#define WMFW_ADSP2_PM 2 +#define WMFW_ADSP2_ZM 4 +#define WMFW_ADSP2_XM 5 +#define WMFW_ADSP2_YM 6 + +#define WMFW_HALO_PM_PACKED 0x10 +#define WMFW_HALO_XM_PACKED 0x11 +#define WMFW_HALO_YM_PACKED 0x12 + +#endif -- cgit v1.2.3 From c25303281d79299e9f35d4b2e496a8bd134d5715 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Jun 2021 23:20:17 -0400 Subject: mm: Convert get_page_unless_zero() to return bool atomic_add_unless() returns bool, so remove the widening casts to int in page_ref_add_unless() and get_page_unless_zero(). This causes gcc to produce slightly larger code in isolate_migratepages_block(), but it's not clear that it's worse code. Net +19 bytes of text. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Kirill A. 
Shutemov Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 2 +- include/linux/page_ref.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..1de8864a1e28 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -754,7 +754,7 @@ static inline int put_page_testzero(struct page *page) * This can be called when MMU is off so it must not access * any of the virtual mappings. */ -static inline int get_page_unless_zero(struct page *page) +static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 7ad46f45df39..3a799de8ad52 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -161,9 +161,9 @@ static inline int page_ref_dec_return(struct page *page) return ret; } -static inline int page_ref_add_unless(struct page *page, int nr, int u) +static inline bool page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_refcount, nr, u); + bool ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); -- cgit v1.2.3 From 7b230db3b8d373219f88a3d25c8fbbf12cc7f233 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 6 Dec 2020 22:22:48 -0500 Subject: mm: Introduce struct folio A struct folio is a new abstraction to replace the venerable struct page. A function which takes a struct folio argument declares that it will operate on the entire (possibly compound) page, not just PAGE_SIZE bytes. In return, the caller guarantees that the pointer it is passing does not point to a tail page. No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mm.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 60 +++++++++++++++++++++++++++++++++++++ include/linux/page-flags.h | 28 +++++++++++++++++ 3 files changed, 163 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1de8864a1e28..9057b8406acf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -950,6 +950,20 @@ static inline unsigned int compound_order(struct page *page) return page[1].compound_order; } +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + static inline bool hpage_pincount_available(struct page *page) { /* @@ -1595,6 +1609,66 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. + */ +static inline long folio_nr_pages(struct folio *folio) +{ + return compound_nr(&folio->page); +} + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. 
+ * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. + * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. + */ +static inline struct folio *folio_next(struct folio *folio) +{ + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); +} + +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(struct folio *folio) +{ + return PAGE_SHIFT + folio_order(folio); +} + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(struct folio *folio) +{ + return PAGE_SIZE << folio_order(folio); +} + /* * Some inline functions in vmstat.h depend on page_zone() */ @@ -1700,6 +1774,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..6908186629b4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -239,6 +239,66 @@ struct page { #endif } _struct_page_alignment; +/** + * struct folio - Represents a contiguous set of bytes. + * @flags: Identical to the page flags. + * @lru: Least Recently Used list; tracks how recently this folio was used. + * @mapping: The file this page belongs to, or refers to the anon_vma for + * anonymous memory. + * @index: Offset within the file, in units of pages. For anonymous memory, + * this is the index from the beginning of the mmap. + * @private: Filesystem per-folio data (see folio_attach_private()). + * Used for swp_entry_t if folio_test_swapcache(). + * @_mapcount: Do not access this member directly. Use folio_mapcount() to + * find out how many times this folio is mapped by userspace. + * @_refcount: Do not access this member directly. Use folio_ref_count() + * to find how many references there are to this folio. + * @memcg_data: Memory Control Group data. + * + * A folio is a physically, virtually and logically contiguous set + * of bytes. It is a power-of-two in size, and it is aligned to that + * same power-of-two. It is at least as large as %PAGE_SIZE. If it is + * in the page cache, it is at a file offset which is a multiple of that + * power-of-two. 
It may be mapped into userspace at an address which is + * at an arbitrary page offset, but its kernel virtual address is aligned + * to its size. + */ +struct folio { + /* private: don't document the anon union */ + union { + struct { + /* public: */ + unsigned long flags; + struct list_head lru; + struct address_space *mapping; + pgoff_t index; + void *private; + atomic_t _mapcount; + atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif + /* private: the union with struct page is transitional */ + }; + struct page page; + }; +}; + +static_assert(sizeof(struct page) == sizeof(struct folio)); +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) +FOLIO_MATCH(flags, flags); +FOLIO_MATCH(lru, lru); +FOLIO_MATCH(compound_head, lru); +FOLIO_MATCH(index, index); +FOLIO_MATCH(private, private); +FOLIO_MATCH(_mapcount, _mapcount); +FOLIO_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +FOLIO_MATCH(memcg_data, memcg_data); +#endif +#undef FOLIO_MATCH + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a558d67ee86f..e9b0723a637d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -193,6 +193,34 @@ static inline unsigned long _compound_head(const struct page *page) #define compound_head(page) ((typeof(page))_compound_head(page)) +/** + * page_folio - Converts from page to folio. + * @p: The page. + * + * Every page is part of a folio. This function cannot be called on a + * NULL pointer. + * + * Context: No reference, nor lock is required on @page. If the caller + * does not hold a reference, this call may race with a folio split, so + * it should re-check the folio still contains this page after gaining + * a reference on the folio. + * Return: The folio which contains this page. + */ +#define page_folio(p) (_Generic((p), \ + const struct page *: (const struct folio *)_compound_head(p), \ + struct page *: (struct folio *)_compound_head(p))) + +/** + * folio_page - Return a page from a folio. + * @folio: The folio. + * @n: The page number to return. + * + * @n is relative to the start of the folio. This function does not + * check that the page number lies within @folio; the caller is presumed + * to have a reference to the page. + */ +#define folio_page(folio, n) nth_page(&(folio)->page, n) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; -- cgit v1.2.3 From 32b8fc486524044f2643c5d3340fa436b761ba71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Jan 2021 07:40:36 -0500 Subject: mm: Add folio_pgdat(), folio_zone() and folio_zonenum() These are just convenience wrappers for callers with folios; pgdat and zone can be reached from tail pages as well as head pages. No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9057b8406acf..3fc524f8c2a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1145,6 +1145,11 @@ static inline enum zone_type page_zonenum(const struct page *page) return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { @@ -1560,6 +1565,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { -- cgit v1.2.3 From a53e17e4e97b4519882d640984a386c522e9fba3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Jan 2021 08:14:00 -0500 Subject: mm/vmstat: Add functions to account folio statistics Allow page counters to be more readily modified by callers which have a folio. Name these wrappers with 'stat' instead of 'state' as requested by Linus here: https://lore.kernel.org/linux-mm/CAHk-=wj847SudR-kt+46fT3+xFFgiwpgThvm7DJWGdi4cVrbnQ@mail.gmail.com/ No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/vmstat.h | 107 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d6a6cf53b127..241bd0f53fb9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -415,6 +415,78 @@ static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ +static inline void __zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + __mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void __zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void __zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void __node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + __mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void __node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void __node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + +static inline void node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { @@ -543,6 +615,24 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + __mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void __lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void __lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} + static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -555,4 +645,21 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum 
node_stat_item idx, int val) +{ + mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} #endif /* _LINUX_VMSTAT_H */ -- cgit v1.2.3 From 9e9edb2094db7eb4c6da21e02ac885955350ecf9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Jan 2021 10:52:37 -0500 Subject: mm/debug: Add VM_BUG_ON_FOLIO() and VM_WARN_ON_ONCE_FOLIO() These are the folio equivalents of VM_BUG_ON_PAGE and VM_WARN_ON_ONCE_PAGE. No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mmdebug.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 1935d4c72d10..d7285f8148a3 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -22,6 +22,13 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_BUG_ON_FOLIO(cond, folio) \ + do { \ + if (unlikely(cond)) { \ + dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\ + BUG(); \ + } \ + } while (0) #define VM_BUG_ON_VMA(cond, vma) \ do { \ if (unlikely(cond)) { \ @@ -47,6 +54,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -55,11 +73,13 @@ void dump_mm(const struct mm_struct *mm); #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond) #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif -- cgit v1.2.3 From c24016ac3a629655ea164b1129816660187943c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 31 Mar 2021 10:39:55 -0400 Subject: mm: Add folio reference count functions These functions mirror their page reference counterparts. Also add the kernel-doc to the mm-api and correct the return type of page_ref_add_unless() to bool. No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/page_ref.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 3a799de8ad52..717d53c9ddf1 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -67,9 +67,31 @@ static inline int page_ref_count(const struct page *page) return atomic_read(&page->_refcount); } +/** + * folio_ref_count - The reference count on this folio. + * @folio: The folio. + * + * The refcount is usually incremented by calls to folio_get() and + * decremented by calls to folio_put(). Some typical users of the + * folio refcount: + * + * - Each reference from a page table + * - The page cache + * - Filesystem private data + * - The LRU list + * - Pipes + * - Direct IO which references this page in the process address space + * + * Return: The number of references to this folio. + */ +static inline int folio_ref_count(const struct folio *folio) +{ + return page_ref_count(&folio->page); +} + static inline int page_count(const struct page *page) { - return atomic_read(&compound_head(page)->_refcount); + return folio_ref_count(page_folio(page)); } static inline void set_page_count(struct page *page, int v) @@ -79,6 +101,11 @@ static inline void set_page_count(struct page *page, int v) __page_ref_set(page, v); } +static inline void folio_set_count(struct folio *folio, int v) +{ + set_page_count(&folio->page, v); +} + /* * Setup the page count before being freed into the page allocator for * the first time (boot or memory hotplug) @@ -95,6 +122,11 @@ static inline void page_ref_add(struct page *page, int nr) __page_ref_mod(page, nr); } +static inline void folio_ref_add(struct folio *folio, int nr) +{ + page_ref_add(&folio->page, nr); +} + static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); @@ -102,6 +134,11 @@ static inline void page_ref_sub(struct page *page, int nr) __page_ref_mod(page, -nr); } +static inline void folio_ref_sub(struct folio *folio, int nr) +{ + page_ref_sub(&folio->page, nr); +} + static inline int page_ref_sub_return(struct page *page, int nr) { int ret = atomic_sub_return(nr, &page->_refcount); @@ -111,6 +148,11 @@ static inline int page_ref_sub_return(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_return(struct folio *folio, int nr) +{ + return page_ref_sub_return(&folio->page, nr); +} + static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); @@ -118,6 +160,11 @@ static inline void page_ref_inc(struct page *page) __page_ref_mod(page, 1); } +static inline void folio_ref_inc(struct folio *folio) +{ + page_ref_inc(&folio->page); +} + static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); @@ -125,6 +172,11 @@ static inline void page_ref_dec(struct page *page) __page_ref_mod(page, -1); } +static inline void folio_ref_dec(struct folio *folio) +{ + page_ref_dec(&folio->page); +} + static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); @@ -134,6 +186,11 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int folio_ref_sub_and_test(struct folio *folio, int nr) +{ + return page_ref_sub_and_test(&folio->page, nr); +} + static inline int page_ref_inc_return(struct page *page) { int ret = 
atomic_inc_return(&page->_refcount); @@ -143,6 +200,11 @@ static inline int page_ref_inc_return(struct page *page) return ret; } +static inline int folio_ref_inc_return(struct folio *folio) +{ + return page_ref_inc_return(&folio->page); +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); @@ -152,6 +214,11 @@ static inline int page_ref_dec_and_test(struct page *page) return ret; } +static inline int folio_ref_dec_and_test(struct folio *folio) +{ + return page_ref_dec_and_test(&folio->page); +} + static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); @@ -161,6 +228,11 @@ static inline int page_ref_dec_return(struct page *page) return ret; } +static inline int folio_ref_dec_return(struct folio *folio) +{ + return page_ref_dec_return(&folio->page); +} + static inline bool page_ref_add_unless(struct page *page, int nr, int u) { bool ret = atomic_add_unless(&page->_refcount, nr, u); @@ -170,6 +242,11 @@ static inline bool page_ref_add_unless(struct page *page, int nr, int u) return ret; } +static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) +{ + return page_ref_add_unless(&folio->page, nr, u); +} + static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); @@ -179,6 +256,11 @@ static inline int page_ref_freeze(struct page *page, int count) return ret; } +static inline int folio_ref_freeze(struct folio *folio, int count) +{ + return page_ref_freeze(&folio->page, count); +} + static inline void page_ref_unfreeze(struct page *page, int count) { VM_BUG_ON_PAGE(page_count(page) != 0, page); @@ -189,4 +271,8 @@ static inline void page_ref_unfreeze(struct page *page, int count) __page_ref_unfreeze(page, count); } +static inline void folio_ref_unfreeze(struct folio *folio, int count) +{ + page_ref_unfreeze(&folio->page, count); +} #endif -- cgit v1.2.3 From b620f63358cd35c8e9084ee9fc460153f64714f6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 6 Dec 2020 23:04:57 -0500 Subject: mm: Add folio_put() If we know we have a folio, we can call folio_put() instead of put_page() and save the overhead of calling compound_head(). Also skips the devmap checks. This commit looks like it should be a no-op, but actually saves 684 bytes of text with the distro-derived config that I'm testing. Some functions grow a little while others shrink. I presume the compiler is making different inlining decisions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mm.h | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fc524f8c2a2..28a84d82247b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -748,6 +748,11 @@ static inline int put_page_testzero(struct page *page) return page_ref_dec_and_test(page); } +static inline int folio_put_testzero(struct folio *folio) +{ + return put_page_testzero(&folio->page); +} + /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. 
@@ -1247,9 +1252,28 @@ static inline __must_check bool try_get_page(struct page *page) return true; } +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); /* * For devmap managed pages we need to catch refcount transition from @@ -1257,13 +1281,12 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (page_is_devmap_managed(&folio->page)) { + put_devmap_managed_page(&folio->page); return; } - if (put_page_testzero(page)) - __put_page(page); + folio_put(folio); } /* -- cgit v1.2.3 From 86d234cb0499c6466ccfc45f6501bc0cd4621c60 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 6 Dec 2020 23:04:57 -0500 Subject: mm: Add folio_get() If we know we have a folio, we can call folio_get() instead of get_page() and save the overhead of calling compound_head(). No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mm.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 28a84d82247b..63685d953ce0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1224,18 +1224,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } /* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) + +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. + */ +static inline void folio_get(struct folio *folio) +{ + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); +} static inline void get_page(struct page *page) { - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_refcount. 
- */ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); + folio_get(page_folio(page)); } bool __must_check try_grab_page(struct page *page, unsigned int flags); -- cgit v1.2.3 From 020853b6f5ead2849b055e9873ff7267ce584256 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 May 2021 16:33:22 -0400 Subject: mm: Add folio_try_get_rcu() This is the equivalent of page_cache_get_speculative(). Also add folio_ref_try_add_rcu (the equivalent of page_cache_add_speculative) and folio_get_unless_zero() (the equivalent of get_page_unless_zero()). The new kernel-doc attempts to explain from the user's point of view when to use folio_try_get_rcu() and when to use folio_get_unless_zero(), because there seems to be some confusion currently between the users of page_cache_get_speculative() and get_page_unless_zero(). Reimplement page_cache_add_speculative() and page_cache_get_speculative() as wrappers around the folio equivalents, but leave get_page_unless_zero() alone for now. This commit reduces text size by 3 bytes due to slightly different register allocation & instruction selections. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Acked-by: Kirill A. Shutemov Acked-by: Mike Rapoport --- include/linux/page_ref.h | 66 +++++++++++++++++++++++++++++++++++++ include/linux/pagemap.h | 84 +++--------------------------------------------- 2 files changed, 70 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 717d53c9ddf1..2e677e6ad09f 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -247,6 +247,72 @@ static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u) return page_ref_add_unless(&folio->page, nr, u); } +/** + * folio_try_get - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * If you do not already have a reference to a folio, you can attempt to + * get one using this function. It may fail if, for example, the folio + * has been freed since you found a pointer to it, or it is frozen for + * the purposes of splitting or migration. + * + * Return: True if the reference count was successfully incremented. + */ +static inline bool folio_try_get(struct folio *folio) +{ + return folio_ref_add_unless(folio, 1, 0); +} + +static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) +{ +#ifdef CONFIG_TINY_RCU + /* + * The caller guarantees the folio will not be freed from interrupt + * context, so (on !SMP) we only need preemption to be disabled + * and TINY_RCU does that for us. + */ +# ifdef CONFIG_PREEMPT_COUNT + VM_BUG_ON(!in_atomic() && !irqs_disabled()); +# endif + VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); + folio_ref_add(folio, count); +#else + if (unlikely(!folio_ref_add_unless(folio, count, 0))) { + /* Either the folio has been freed, or will be freed. */ + return false; + } +#endif + return true; +} + +/** + * folio_try_get_rcu - Attempt to increase the refcount on a folio. + * @folio: The folio. + * + * This is a version of folio_try_get() optimised for non-SMP kernels. + * If you are still holding the rcu_read_lock() after looking up the + * page and know that the page cannot have its refcount decreased to + * zero in interrupt context, you can use this instead of folio_try_get(). 
+ * + * Example users include get_user_pages_fast() (as pages are not unmapped + * from interrupt context) and the page cache lookups (as pages are not + * truncated from interrupt context). We also know that pages are not + * frozen in interrupt context for the purposes of splitting or migration. + * + * You can also use this function if you're holding a lock that prevents + * pages being frozen & removed; eg the i_pages lock for the page cache + * or the mmap_sem or page table lock for page tables. In this case, + * it will always succeed, and you could have used a plain folio_get(), + * but it's sometimes more convenient to have a common function called + * from both locked and RCU-protected contexts. + * + * Return: True if the reference count was successfully incremented. + */ +static inline bool folio_try_get_rcu(struct folio *folio) +{ + return folio_ref_try_add_rcu(folio, 1); +} + static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..f3ec26551082 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -172,91 +172,15 @@ static inline struct address_space *page_mapping_file(struct page *page) return page_mapping(page); } -/* - * speculatively take a reference to a page. - * If the page is free (_refcount == 0), then _refcount is untouched, and 0 - * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. - * - * This function must be called inside the same rcu_read_lock() section as has - * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _refcount. - * - * Unless an RCU grace period has passed, the count of all pages coming out - * of the allocator must be considered unstable. page_count may return higher - * than expected, and put_page must be able to do the right thing when the - * page has been finished with, no matter what it is subsequently allocated - * for (because put_page is what is used here to drop an invalid speculative - * reference). - * - * This is the interesting part of the lockless pagecache (and lockless - * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page) - * has the following pattern: - * 1. find page in radix tree - * 2. conditionally increment refcount - * 3. check the page is still in pagecache (if no, goto 1) - * - * Remove-side that cares about stability of _refcount (eg. reclaim) has the - * following (with the i_pages lock held): - * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) - * B. remove page from pagecache - * C. free the page - * - * There are 2 critical interleavings that matter: - * - 2 runs before A: in this case, A sees elevated refcount and bails out - * - A runs before 2: in this case, 2 sees zero refcount and retries; - * subsequently, B will complete and 1 will find no page, causing the - * lookup to return NULL. - * - * It is possible that between 1 and 2, the page is removed then the exact same - * page is inserted into the same position in pagecache. That's OK: the - * old find_get_page using a lock could equally have run before or after - * such a re-insertion, depending on order that locks are granted. - * - * Lookups racing against pagecache insertion isn't a big problem: either 1 - * will find the page or it will not. 
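The three-step protocol described above maps directly onto folio_try_get_rcu(); a simplified sketch, assuming a plain XArray lookup and ignoring shadow (value) entries (my_find_get_folio() is hypothetical):

static struct folio *my_find_get_folio(struct address_space *mapping,
		pgoff_t index)
{
	struct folio *folio;

	rcu_read_lock();
repeat:
	folio = xa_load(&mapping->i_pages, index);	/* 1. find folio */
	if (!folio)
		goto out;
	if (!folio_try_get_rcu(folio))			/* 2. conditionally take a ref */
		goto repeat;
	/* 3. check it is still in the page cache; if not, drop the ref and retry */
	if (unlikely(folio != xa_load(&mapping->i_pages, index))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();
	return folio;
}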
Likewise, the old find_get_page could run - * either before the insertion or afterwards, depending on timing. - */ -static inline int __page_cache_add_speculative(struct page *page, int count) +static inline bool page_cache_add_speculative(struct page *page, int count) { -#ifdef CONFIG_TINY_RCU -# ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic() && !irqs_disabled()); -# endif - /* - * Preempt must be disabled here - we rely on rcu_read_lock doing - * this for us. - * - * Pagecache won't be truncated from interrupt context, so if we have - * found a page in the radix tree here, we have pinned its refcount by - * disabling preempt, and hence no need for the "speculative get" that - * SMP requires. - */ - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, count); - -#else - if (unlikely(!page_ref_add_unless(page, count, 0))) { - /* - * Either the page has been freed, or will be freed. - * In either case, retry here and the caller should - * do the right thing (see comments above). - */ - return 0; - } -#endif VM_BUG_ON_PAGE(PageTail(page), page); - - return 1; -} - -static inline int page_cache_get_speculative(struct page *page) -{ - return __page_cache_add_speculative(page, 1); + return folio_ref_try_add_rcu((struct folio *)page, count); } -static inline int page_cache_add_speculative(struct page *page, int count) +static inline bool page_cache_get_speculative(struct page *page) { - return __page_cache_add_speculative(page, count); + return page_cache_add_speculative(page, 1); } /** -- cgit v1.2.3 From d389a4a8115518e2cc232649b012b72113fe8b67 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 7 Dec 2020 15:42:09 -0500 Subject: mm: Add folio flag manipulation functions These new functions are the folio analogues of the various PageFlags functions. If CONFIG_DEBUG_VM_PGFLAGS is enabled, we check the folio is not a tail page at every invocation. This will also catch the PagePoisoned case as a poisoned page has every bit set, which would include PageTail. This saves 1684 bytes of text with the distro-derived config that I'm testing due to removing a double call to compound_head() in PageSwapCache(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/page-flags.h | 219 ++++++++++++++++++++++++++++++++------------- 1 file changed, 156 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e9b0723a637d..2491e84b8e52 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -143,6 +143,8 @@ enum pageflags { #endif __NR_PAGEFLAGS, + PG_readahead = PG_reclaim, + /* Filesystems */ PG_checked = PG_owner_priv_1, @@ -245,6 +247,15 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static unsigned long *folio_flags(struct folio *folio, unsigned n) +{ + struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + /* * Page flags policies wrt compound pages * @@ -289,36 +300,64 @@ static inline void page_init_poison(struct page *page, size_t size) VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) +/* Which page is the flag stored in */ +#define FOLIO_PF_ANY 0 +#define FOLIO_PF_HEAD 0 +#define FOLIO_PF_ONLY_HEAD 0 +#define FOLIO_PF_NO_TAIL 0 +#define FOLIO_PF_NO_COMPOUND 0 +#define FOLIO_PF_SECOND 1 + /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ +static __always_inline bool folio_test_##lname(struct folio *folio) \ +{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int Page##uname(struct page *page) \ - { return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_set_##lname(struct folio *folio) \ +{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void SetPage##uname(struct page *page) \ - { set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void folio_clear_##lname(struct folio *folio) \ +{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void ClearPage##uname(struct page *page) \ - { clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_set_##lname(struct folio *folio) \ +{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __SetPage##uname(struct page *page) \ - { __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline \ +void __folio_clear_##lname(struct folio *folio) \ +{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __ClearPage##uname(struct page *page) \ - { __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_set_##lname(struct folio *folio) \ +{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestSetPage##uname(struct page *page) \ - { return test_and_set_bit(PG_##lname, &policy(page, 
1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ +static __always_inline \ +bool folio_test_clear_##lname(struct folio *folio) \ +{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestClearPage##uname(struct page *page) \ - { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -334,29 +373,37 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) -#define TESTPAGEFLAG_FALSE(uname) \ +#define TESTPAGEFLAG_FALSE(uname, lname) \ +static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ static inline int Page##uname(const struct page *page) { return 0; } -#define SETPAGEFLAG_NOOP(uname) \ +#define SETPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_set_##lname(struct folio *folio) { } \ static inline void SetPage##uname(struct page *page) { } -#define CLEARPAGEFLAG_NOOP(uname) \ +#define CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void folio_clear_##lname(struct folio *folio) { } \ static inline void ClearPage##uname(struct page *page) { } -#define __CLEARPAGEFLAG_NOOP(uname) \ +#define __CLEARPAGEFLAG_NOOP(uname, lname) \ +static inline void __folio_clear_##lname(struct folio *folio) { } \ static inline void __ClearPage##uname(struct page *page) { } -#define TESTSETFLAG_FALSE(uname) \ +#define TESTSETFLAG_FALSE(uname, lname) \ +static inline bool folio_test_set_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestSetPage##uname(struct page *page) { return 0; } -#define TESTCLEARFLAG_FALSE(uname) \ +#define TESTCLEARFLAG_FALSE(uname, lname) \ +static inline bool folio_test_clear_##lname(struct folio *folio) \ +{ return 0; } \ static inline int TestClearPage##uname(struct page *page) { return 0; } -#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ - SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) +#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ + SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) -#define TESTSCFLAG_FALSE(uname) \ - TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) +#define TESTSCFLAG_FALSE(uname, lname) \ + TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) @@ -412,8 +459,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) +PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) + TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* @@ -422,22 +469,25 @@ PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else -PAGEFLAG_FALSE(HighMem) +PAGEFLAG_FALSE(HighMem, highmem) #endif #ifdef CONFIG_SWAP -static __always_inline int PageSwapCache(struct page *page) +static __always_inline bool folio_test_swapcache(struct folio *folio) { -#ifdef CONFIG_THP_SWAP - page = compound_head(page); -#endif - return PageSwapBacked(page) && 
test_bit(PG_swapcache, &page->flags); + return folio_test_swapbacked(folio) && + test_bit(PG_swapcache, folio_flags(folio, 0)); +} +static __always_inline bool PageSwapCache(struct page *page) +{ + return folio_test_swapcache(page_folio(page)); } + SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else -PAGEFLAG_FALSE(SwapCache) +PAGEFLAG_FALSE(SwapCache, swapcache) #endif PAGEFLAG(Unevictable, unevictable, PF_HEAD) @@ -449,14 +499,14 @@ PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) #else -PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) - TESTSCFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) + TESTSCFLAG_FALSE(Mlocked, mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else -PAGEFLAG_FALSE(Uncached) +PAGEFLAG_FALSE(Uncached, uncached) #endif #ifdef CONFIG_MEMORY_FAILURE @@ -465,7 +515,7 @@ TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool take_page_off_buddy(struct page *page); #else -PAGEFLAG_FALSE(HWPoison) +PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif @@ -479,7 +529,7 @@ PAGEFLAG(Idle, idle, PF_ANY) #ifdef CONFIG_KASAN_HW_TAGS PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD) #else -PAGEFLAG_FALSE(SkipKASanPoison) +PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) #endif /* @@ -517,10 +567,14 @@ static __always_inline int PageMappingFlags(struct page *page) return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } -static __always_inline int PageAnon(struct page *page) +static __always_inline bool folio_test_anon(struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; +} + +static __always_inline bool PageAnon(struct page *page) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return folio_test_anon(page_folio(page)); } static __always_inline int __PageMovable(struct page *page) @@ -536,30 +590,32 @@ static __always_inline int __PageMovable(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static __always_inline int PageKsm(struct page *page) +static __always_inline bool folio_test_ksm(struct folio *folio) { - page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } + +static __always_inline bool PageKsm(struct page *page) +{ + return folio_test_ksm(page_folio(page)); +} #else -TESTPAGEFLAG_FALSE(Ksm) +TESTPAGEFLAG_FALSE(Ksm, ksm) #endif u64 stable_page_flags(struct page *page); -static inline int PageUptodate(struct page *page) +static inline bool folio_test_uptodate(struct folio *folio) { - int ret; - page = compound_head(page); - ret = test_bit(PG_uptodate, &(page)->flags); + bool ret = test_bit(PG_uptodate, folio_flags(folio, 0)); /* - * Must ensure that the data we read out of the page is loaded - * _after_ we've loaded page->flags to check for PageUptodate. - * We can skip the barrier if the page is not uptodate, because + * Must ensure that the data we read out of the folio is loaded + * _after_ we've loaded folio->flags to check the uptodate bit. + * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. 
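Each macro above emits a folio_test_*/folio_set_*/folio_clear_* family alongside the Page* one, so callers compose them the same way; a sketch (my_mark_folio_referenced() is hypothetical; folio_test_referenced()/folio_set_referenced() are assumed to come from the PAGEFLAG(Referenced, ...) instantiation):

static void my_mark_folio_referenced(struct folio *folio)
{
	if (!folio_test_referenced(folio))
		folio_set_referenced(folio);	/* atomic set_bit on folio_flags(folio, 0) */
}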
* - * See SetPageUptodate() for the other side of the story. + * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); @@ -567,23 +623,36 @@ static inline int PageUptodate(struct page *page) return ret; } -static __always_inline void __SetPageUptodate(struct page *page) +static inline int PageUptodate(struct page *page) +{ + return folio_test_uptodate(page_folio(page)); +} + +static __always_inline void __folio_mark_uptodate(struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); - __set_bit(PG_uptodate, &page->flags); + __set_bit(PG_uptodate, folio_flags(folio, 0)); } -static __always_inline void SetPageUptodate(struct page *page) +static __always_inline void folio_mark_uptodate(struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, - * so that all previous stores issued in order to bring the page - * uptodate are actually visible before PageUptodate becomes true. + * so that all previous stores issued in order to bring the folio + * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); - set_bit(PG_uptodate, &page->flags); + set_bit(PG_uptodate, folio_flags(folio, 0)); +} + +static __always_inline void __SetPageUptodate(struct page *page) +{ + __folio_mark_uptodate((struct folio *)page); +} + +static __always_inline void SetPageUptodate(struct page *page) +{ + folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) @@ -608,6 +677,17 @@ static inline void set_page_writeback_keepwrite(struct page *page) __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +/* Whether there are one or multiple pages in a folio */ +static inline bool folio_test_single(struct folio *folio) +{ + return !folio_test_head(folio); +} + +static inline bool folio_test_multi(struct folio *folio) +{ + return folio_test_head(folio); +} + static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); @@ -631,12 +711,15 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return PageHeadHuge(&folio->page); +} #else -TESTPAGEFLAG_FALSE(Huge) -TESTPAGEFLAG_FALSE(HeadHuge) +TESTPAGEFLAG_FALSE(Huge, hugetlb) +TESTPAGEFLAG_FALSE(HeadHuge, headhuge) #endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -652,6 +735,11 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } +static inline bool folio_test_transhuge(struct folio *folio) +{ + return folio_test_head(folio); +} + /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known @@ -688,12 +776,12 @@ static inline int PageTransTail(struct page *page) PAGEFLAG(DoubleMap, double_map, PF_SECOND) TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else -TESTPAGEFLAG_FALSE(TransHuge) -TESTPAGEFLAG_FALSE(TransCompound) -TESTPAGEFLAG_FALSE(TransCompoundMap) -TESTPAGEFLAG_FALSE(TransTail) -PAGEFLAG_FALSE(DoubleMap) - TESTSCFLAG_FALSE(DoubleMap) +TESTPAGEFLAG_FALSE(TransHuge, transhuge) +TESTPAGEFLAG_FALSE(TransCompound, transcompound) +TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) +TESTPAGEFLAG_FALSE(TransTail, transtail) +PAGEFLAG_FALSE(DoubleMap, double_map) + 
TESTSCFLAG_FALSE(DoubleMap, double_map) #endif /* @@ -877,6 +965,11 @@ static inline int page_has_private(struct page *page) return !!(page->flags & PAGE_FLAGS_PRIVATE); } +static inline bool folio_has_private(struct folio *folio) +{ + return page_has_private(&folio->page); +} + #undef PF_ANY #undef PF_HEAD #undef PF_ONLY_HEAD -- cgit v1.2.3 From 889a3747b3b7661b089ba4eae081a3b6bb351a23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 25 Feb 2021 09:47:41 -0500 Subject: mm/lru: Add folio LRU functions Handle arbitrary-order folios being added to the LRU. By definition, all pages being added to the LRU were already head or base pages, but call page_folio() on them anyway to get the type right and avoid the buried calls to compound_head(). Saves 783 bytes of kernel text; no functions grow. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Yu Zhao Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Kirill A. Shutemov Acked-by: Mike Rapoport Acked-by: Vlastimil Babka --- include/linux/mm_inline.h | 103 ++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355ea1ee32bd..e2ec68b0515c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -6,27 +6,33 @@ #include /** - * page_is_file_lru - should the page be on a file LRU or anon LRU? - * @page: the page to test - * - * Returns 1 if @page is a regular filesystem backed page cache page or a lazily - * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal - * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by - * functions that manipulate the LRU lists, to sort a page onto the right LRU - * list. + * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? + * @folio: The folio to test. * * We would like to get this info without a page flag, but the state - * needs to survive until the page is last deleted from the LRU, which + * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. + * + * Return: An integer (not a boolean!) used to sort a folio onto the + * right LRU list and to account folios correctly. + * 1 if @folio is a regular filesystem backed page cache folio + * or a lazily freed anonymous folio (e.g. via MADV_FREE). + * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise + * ram or swap backed folio. */ +static inline int folio_is_file_lru(struct folio *folio) +{ + return !folio_test_swapbacked(folio); +} + static inline int page_is_file_lru(struct page *page) { - return !PageSwapBacked(page); + return folio_is_file_lru(page_folio(page)); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, - int nr_pages) + long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -39,69 +45,94 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, } /** - * __clear_page_lru_flags - clear page lru flags before releasing a page - * @page: the page that was on lru and now has a zero reference + * __folio_clear_lru_flags - Clear page lru flags before releasing a page. + * @folio: The folio that was on lru and now has a zero reference. 
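As a usage sketch of the helpers this patch introduces just below (lruvec_del_folio()/lruvec_add_folio()), an activation path might look like this; my_folio_activate() is hypothetical and assumes the caller holds the lruvec lock:

static void my_folio_activate(struct lruvec *lruvec, struct folio *folio)
{
	lruvec_del_folio(lruvec, folio);	/* accounts -folio_nr_pages(folio) */
	folio_set_active(folio);
	lruvec_add_folio(lruvec, folio);	/* now lands on an active LRU list */
}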
*/ -static __always_inline void __clear_page_lru_flags(struct page *page) +static __always_inline void __folio_clear_lru_flags(struct folio *folio) { - VM_BUG_ON_PAGE(!PageLRU(page), page); + VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); - __ClearPageLRU(page); + __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ - if (PageActive(page) && PageUnevictable(page)) + if (folio_test_active(folio) && folio_test_unevictable(folio)) return; - __ClearPageActive(page); - __ClearPageUnevictable(page); + __folio_clear_active(folio); + __folio_clear_unevictable(folio); +} + +static __always_inline void __clear_page_lru_flags(struct page *page) +{ + __folio_clear_lru_flags(page_folio(page)); } /** - * page_lru - which LRU list should a page be on? - * @page: the page to test + * folio_lru_list - Which LRU list should a folio be on? + * @folio: The folio to test. * - * Returns the LRU list a page should be on, as an index + * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list page_lru(struct page *page) +static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; - lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; - if (PageActive(page)) + lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; + if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } +static __always_inline +void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add(&folio->lru, &lruvec->lists[lru]); +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) +{ + enum lru_list lru = folio_lru_list(folio); + + update_lru_size(lruvec, lru, folio_zonenum(folio), + folio_nr_pages(folio)); + list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, struct lruvec *lruvec) { - enum lru_list lru = page_lru(page); + lruvec_add_folio_tail(lruvec, page_folio(page)); +} - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); +static __always_inline +void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) +{ + list_del(&folio->lru); + update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + -folio_nr_pages(folio)); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec) { - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); + lruvec_del_folio(lruvec, page_folio(page)); } #endif -- cgit v1.2.3 From 85d0a2ed3747da7f9aedacb72478bbedf06f9f2e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Jan 2021 10:04:40 -0500 Subject: mm: Handle per-folio private data Add folio_get_private() 
which mirrors page_private() -- ie folio private data is the same as page private data. The only difference is that these return a void * instead of an unsigned long, which matches the majority of users. Turn attach_page_private() into folio_attach_private() and reimplement attach_page_private() as a wrapper. No filesystem which uses page private data currently supports compound pages, so we're free to define the rules. attach_page_private() may only be called on a head page; if you want to add private data to a tail page, you can call set_page_private() directly (and shouldn't increment the page refcount! That should be done when adding private data to the head page / folio). This saves 813 bytes of text with the distro-derived config that I'm testing due to removing the calls to compound_head() in get_page() & put_page(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/mm_types.h | 11 +++++++++++ include/linux/pagemap.h | 48 +++++++++++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6908186629b4..5ebcb86ac934 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -317,6 +317,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +/* + * page_private can be used on tail pages. However, PagePrivate is only + * checked by the VM on the head page. So page_private on the tail pages + * should be used for data that's ancillary to the head page (eg attaching + * buffer heads to tail pages after attaching buffer heads to the head page) + */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) @@ -324,6 +330,11 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } +static inline void *folio_get_private(struct folio *folio) +{ + return folio->private; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f3ec26551082..b0f5bf1cb540 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -184,42 +184,52 @@ static inline bool page_cache_get_speculative(struct page *page) } /** - * attach_page_private - Attach private data to a page. - * @page: Page to attach data to. - * @data: Data to attach to page. + * folio_attach_private - Attach private data to a folio. + * @folio: Folio to attach data to. + * @data: Data to attach to folio. * - * Attaching private data to a page increments the page's reference count. - * The data must be detached before the page will be freed. + * Attaching private data to a folio increments the page's reference count. + * The data must be detached before the folio will be freed. 
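A sketch of the intended calling pattern (hypothetical filesystem code, assuming <linux/slab.h>; struct my_fs_private and the my_fs_* helpers are not part of the patch):

struct my_fs_private {
	int nr_buffers;
};

static void my_fs_init_folio(struct folio *folio, struct my_fs_private *p)
{
	folio_attach_private(folio, p);	/* takes a reference on the folio */
}

static void my_fs_release_folio(struct folio *folio)
{
	struct my_fs_private *p = folio_detach_private(folio);

	kfree(p);	/* p is NULL if nothing was attached; kfree(NULL) is safe */
}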
*/ -static inline void attach_page_private(struct page *page, void *data) +static inline void folio_attach_private(struct folio *folio, void *data) { - get_page(page); - set_page_private(page, (unsigned long)data); - SetPagePrivate(page); + folio_get(folio); + folio->private = data; + folio_set_private(folio); } /** - * detach_page_private - Detach private data from a page. - * @page: Page to detach data from. + * folio_detach_private - Detach private data from a folio. + * @folio: Folio to detach data from. * - * Removes the data that was previously attached to the page and decrements + * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * - * Return: Data that was attached to the page. + * Return: Data that was attached to the folio. */ -static inline void *detach_page_private(struct page *page) +static inline void *folio_detach_private(struct folio *folio) { - void *data = (void *)page_private(page); + void *data = folio_get_private(folio); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + folio_clear_private(folio); + folio->private = NULL; + folio_put(folio); return data; } +static inline void attach_page_private(struct page *page, void *data) +{ + folio_attach_private(page_folio(page), data); +} + +static inline void *detach_page_private(struct page *page) +{ + return folio_detach_private(page_folio(page)); +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else -- cgit v1.2.3 From 9257e15677384d1ccdfe0619c4455e34cb0342c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Jan 2021 23:39:21 -0500 Subject: mm/filemap: Add folio_index(), folio_file_page() and folio_contains() folio_index() is the equivalent of page_index() for folios. folio_file_page() is the equivalent of find_subpage(). folio_contains() is the equivalent of thp_contains(). No changes to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b0f5bf1cb540..e7b46223239a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -386,6 +386,62 @@ static inline bool thp_contains(struct page *head, pgoff_t index) return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); } +#define swapcache_index(folio) __page_file_index(&(folio)->page) + +/** + * folio_index - File index of a folio. + * @folio: The folio. + * + * For a folio which is either in the page cache or the swap cache, + * return its index within the address_space it belongs to. If you know + * the page is definitely in the page cache, you can look at the folio's + * index directly. + * + * Return: The index (offset in units of pages) of a folio in its file. + */ +static inline pgoff_t folio_index(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swapcache_index(folio); + return folio->index; +} + +/** + * folio_file_page - The page for a particular index. + * @folio: The folio which contains this index. + * @index: The index we want to look up. 
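Combining these helpers, a fault path could resolve the precise page for an index like this (a sketch using folio_contains() and folio_file_page(), both defined just below; my_fault_page() is hypothetical):

static struct page *my_fault_page(struct folio *folio, pgoff_t index)
{
	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
	return folio_file_page(folio, index);
}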
+ * + * Sometimes after looking up a folio in the page cache, we need to + * obtain the specific page for an index (eg a page fault). + * + * Return: The page containing the file data for this index. + */ +static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return &folio->page; + return folio_page(folio, index & (folio_nr_pages(folio) - 1)); +} + +/** + * folio_contains - Does this folio contain this index? + * @folio: The folio. + * @index: The page index within the file. + * + * Context: The caller should have the page locked in order to prevent + * (eg) shmem from moving the page between the page cache and swap cache + * and changing its index in the middle of the operation. + * Return: true or false. + */ +static inline bool folio_contains(struct folio *folio, pgoff_t index) +{ + /* HugeTLBfs indexes the page cache in units of hpage_size */ + if (folio_test_hugetlb(folio)) + return folio->index == index; + return index - folio_index(folio) < folio_nr_pages(folio); +} + /* * Given the page we found in the page cache, return the page corresponding * to this index in the file -- cgit v1.2.3 From f94b18f6653ab6d3aa503213c2e1e79b18f05a30 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 21 Mar 2021 16:24:31 -0400 Subject: mm/filemap: Add folio_next_index() This helper returns the page index of the next folio in the file (ie the end of this folio, plus one). No changes to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e7b46223239a..95e83a9e73c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -406,6 +406,17 @@ static inline pgoff_t folio_index(struct folio *folio) return folio->index; } +/** + * folio_next_index - Get the index of the next folio. + * @folio: The current folio. + * + * Return: The index of the folio which follows this folio in the file. + */ +static inline pgoff_t folio_next_index(struct folio *folio) +{ + return folio->index + folio_nr_pages(folio); +} + /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. -- cgit v1.2.3 From 352b47a6984470954e1e262126b1da5312c40b09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Dec 2020 07:25:19 -0500 Subject: mm/filemap: Add folio_pos() and folio_file_pos() These are just wrappers around page_offset() and page_file_offset() respectively. No change to generated code. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells --- include/linux/pagemap.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 95e83a9e73c7..3354117a1dae 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -561,6 +561,27 @@ static inline loff_t page_file_offset(struct page *page) return ((loff_t)page_index(page)) << PAGE_SHIFT; } +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + */ +static inline loff_t folio_pos(struct folio *folio) +{ + return page_offset(&folio->page); +} + +/** + * folio_file_pos - Returns the byte position of this folio in its file. + * @folio: The folio. + * + * This differs from folio_pos() for folios which belong to a swap file. + * NFS is the only filesystem today which needs to use folio_file_pos(). + */ +static inline loff_t folio_file_pos(struct folio *folio) +{ + return page_file_offset(&folio->page); +} + extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); -- cgit v1.2.3 From 2f52578f9c64d7d9a96ab81c243cc20804fabf2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 10 Dec 2020 10:55:05 -0500 Subject: mm/util: Add folio_mapping() and folio_file_mapping() These are the folio equivalent of page_mapping() and page_file_mapping(). Add an out-of-line page_mapping() wrapper around folio_mapping() in order to prevent the page_folio() call from bloating every caller of page_mapping(). Adjust page_file_mapping() and page_mapping_file() to use folios internally. Rename __page_file_mapping() to swapcache_mapping() and change it to take a folio. This ends up saving 122 bytes of text overall. folio_mapping() is 45 bytes shorter than page_mapping() was, but the new page_mapping() wrapper is 30 bytes. The major reduction is a few bytes less in dozens of nfs functions (which call page_file_mapping()). Most of these appear to be a slight change in gcc's register allocation decisions, which allow: 48 8b 56 08 mov 0x8(%rsi),%rdx 48 8d 42 ff lea -0x1(%rdx),%rax 83 e2 01 and $0x1,%edx 48 0f 44 c6 cmove %rsi,%rax to become: 48 8b 46 08 mov 0x8(%rsi),%rax 48 8d 78 ff lea -0x1(%rax),%rdi a8 01 test $0x1,%al 48 0f 44 fe cmove %rsi,%rdi for a reduction of a single byte. Once the NFS client is converted to use folios, this entire sequence will disappear. Also add folio_mapping() documentation. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells --- include/linux/mm.h | 14 -------------- include/linux/pagemap.h | 35 +++++++++++++++++++++++++++++++++-- include/linux/swap.h | 6 ++++++ 3 files changed, 39 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 63685d953ce0..bc5c38e1f780 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1755,19 +1755,6 @@ void page_address_init(void); extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); -extern struct address_space *page_mapping(struct page *page); - -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; -} - extern pgoff_t __page_file_index(struct page *page); /* @@ -1782,7 +1769,6 @@ static inline pgoff_t page_index(struct page *page) } bool page_mapped(struct page *page); -struct address_space *page_mapping(struct page *page); /* * Return true only if the page has been allocated with diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3354117a1dae..0ca96a40fabe 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -162,14 +162,45 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) void release_pages(struct page **pages, int nr); +struct address_space *page_mapping(struct page *); +struct address_space *folio_mapping(struct folio *); +struct address_space *swapcache_mapping(struct folio *); + +/** + * folio_file_mapping - Find the mapping this folio belongs to. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the mapping of the + * swap file or swap device where the data is stored. This is different + * from the mapping returned by folio_mapping(). The only reason to + * use it is if, like NFS, you return 0 from ->activate_swapfile. + * + * Do not call this for folios which aren't in the page cache or swap cache. 
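A sketch of how a caller might combine folio_file_mapping() (defined just below) with folio_file_pos() from the previous patch (my_folio_backing() is hypothetical):

static void my_folio_backing(struct folio *folio)
{
	struct address_space *mapping = folio_file_mapping(folio);

	/* Works for swap-cache folios too: reports the swap file's mapping. */
	pr_debug("folio data at offset %lld in mapping %p\n",
		 folio_file_pos(folio), mapping);
}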
+ */ +static inline struct address_space *folio_file_mapping(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return swapcache_mapping(folio); + + return folio->mapping; +} + +static inline struct address_space *page_file_mapping(struct page *page) +{ + return folio_file_mapping(page_folio(page)); +} + /* * For file cache pages, return the address_space, otherwise return NULL */ static inline struct address_space *page_mapping_file(struct page *page) { - if (unlikely(PageSwapCache(page))) + struct folio *folio = page_folio(page); + + if (unlikely(folio_test_swapcache(folio))) return NULL; - return page_mapping(page); + return folio_mapping(folio); } static inline bool page_cache_add_speculative(struct page *page, int count) diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..85607c6c0cba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,6 +320,12 @@ struct vma_swap_readahead { #endif }; +static inline swp_entry_t folio_swap_entry(struct folio *folio) +{ + swp_entry_t entry = { .val = page_private(&folio->page) }; + return entry; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); -- cgit v1.2.3 From 4e1364286d0a2dd384bceb6db6185b99c0e2c0bc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 7 Dec 2020 15:44:35 -0500 Subject: mm/filemap: Add folio_unlock() Convert unlock_page() to call folio_unlock(). By using a folio we avoid a call to compound_head(). This shortens the function from 39 bytes to 25 and removes 4 instructions on x86-64. Because we still have unlock_page(), it's a net increase of 16 bytes of text for the kernel as a whole, but any path that uses folio_unlock() will execute 4 fewer instructions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0ca96a40fabe..8087921641a3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -657,7 +657,8 @@ extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); -extern void unlock_page(struct page *page); +void unlock_page(struct page *page); +void folio_unlock(struct folio *folio); /* * Return true if the page was successfully locked -- cgit v1.2.3 From 7c23c782d5d57df97509ed2fc17f9b9490f18f1b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Mar 2021 19:38:25 -0500 Subject: mm/filemap: Add folio_lock() This is like lock_page() but for use by callers who know they have a folio. Convert __lock_page() to be __folio_lock(). This saves one call to compound_head() per contended call to lock_page(). Saves 455 bytes of text; mostly from improved register allocation and inlining decisions. __folio_lock is 59 bytes while __lock_page was 79. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8087921641a3..6481a431ea40 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -652,7 +652,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, return true; } -extern void __lock_page(struct page *page); +void __folio_lock(struct folio *folio); extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, @@ -660,13 +660,24 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, void unlock_page(struct page *page); void folio_unlock(struct folio *folio); +static inline bool folio_trylock(struct folio *folio) +{ + return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0))); +} + /* * Return true if the page was successfully locked */ static inline int trylock_page(struct page *page) { - page = compound_head(page); - return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); + return folio_trylock(page_folio(page)); +} + +static inline void folio_lock(struct folio *folio) +{ + might_sleep(); + if (!folio_trylock(folio)) + __folio_lock(folio); } /* @@ -674,9 +685,12 @@ static inline int trylock_page(struct page *page) */ static inline void lock_page(struct page *page) { + struct folio *folio; might_sleep(); - if (!trylock_page(page)) - __lock_page(page); + + folio = page_folio(page); + if (!folio_trylock(folio)) + __folio_lock(folio); } /* -- cgit v1.2.3 From af7f29d9e1a7bda1429923327421367b69aa2e70 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Dec 2020 00:07:31 -0500 Subject: mm/filemap: Add folio_lock_killable() This is like lock_page_killable() but for use by callers who know they have a folio. Convert __lock_page_killable() to be __folio_lock_killable(). This saves one call to compound_head() per contended call to lock_page_killable(). __folio_lock_killable() is 19 bytes smaller than __lock_page_killable() was. filemap_fault() shrinks by 74 bytes and __lock_page_or_retry() shrinks by 71 bytes. That's a total of 164 bytes of text saved. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Acked-by: Mike Rapoport Reviewed-by: David Howells --- include/linux/pagemap.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6481a431ea40..cf0ebd8c9e86 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -653,7 +653,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, } void __folio_lock(struct folio *folio); -extern int __lock_page_killable(struct page *page); +int __folio_lock_killable(struct folio *folio); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); @@ -693,6 +693,14 @@ static inline void lock_page(struct page *page) __folio_lock(folio); } +static inline int folio_lock_killable(struct folio *folio) +{ + might_sleep(); + if (!folio_trylock(folio)) + return __folio_lock_killable(folio); + return 0; +} + /* * lock_page_killable is like lock_page but can be interrupted by fatal * signals. It returns 0 if it locked the page and -EINTR if it was @@ -700,10 +708,7 @@ static inline void lock_page(struct page *page) */ static inline int lock_page_killable(struct page *page) { - might_sleep(); - if (!trylock_page(page)) - return __lock_page_killable(page); - return 0; + return folio_lock_killable(page_folio(page)); } /* -- cgit v1.2.3 From ffdc8dabf20b1b894eda63e7ec9ca15ab0b7292c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 30 Dec 2020 17:58:40 -0500 Subject: mm/filemap: Add __folio_lock_async() There aren't any actual callers of lock_page_async(), so remove it. Convert filemap_update_page() to call __folio_lock_async(). __folio_lock_async() is 21 bytes smaller than __lock_page_async(), but the real savings come from using a folio in filemap_update_page(), shrinking it from 515 bytes to 404 bytes, saving 111 bytes. The text shrinks by 132 bytes in total. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cf0ebd8c9e86..5b5e8bd0b3fb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,7 +654,6 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); -extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); void unlock_page(struct page *page); @@ -711,22 +710,6 @@ static inline int lock_page_killable(struct page *page) return folio_lock_killable(page_folio(page)); } -/* - * lock_page_async - Lock the page, unless this would block. If the page - * is already locked, then queue a callback when the page becomes unlocked. - * This callback can then retry the operation. - * - * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page - * was already locked and the callback defined in 'wait' was queued.
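Together with folio_unlock() from the earlier patch, the killable variant follows the familiar pattern; a minimal sketch (my_with_locked_folio() is hypothetical):

static int my_with_locked_folio(struct folio *folio)
{
	int err = folio_lock_killable(folio);

	if (err)
		return err;	/* -EINTR: interrupted by a fatal signal */
	/* ... folio is locked here; operate on it ... */
	folio_unlock(folio);
	return 0;
}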
- */ -static inline int lock_page_async(struct page *page, - struct wait_page_queue *wait) -{ - if (!trylock_page(page)) - return __lock_page_async(page, wait); - return 0; -} - /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. -- cgit v1.2.3 From 6baa8d602e84d97a7541ed94ccaeb6a3f9763111 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Mar 2021 10:21:02 -0500 Subject: mm/filemap: Add folio_wait_locked() Also add folio_wait_locked_killable(). Turn wait_on_page_locked() and wait_on_page_locked_killable() into wrappers. This eliminates a call to compound_head() from each call-site, reducing text size by 193 bytes for me. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5b5e8bd0b3fb..77dbb7f625af 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -732,23 +732,33 @@ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); /* - * Wait for a page to be unlocked. + * Wait for a folio to be unlocked. * - * This must be called with the caller "holding" the page, - * ie with increased "page->count" so that the page won't + * This must be called with the caller "holding" the folio, + * ie with increased "page->count" so that the folio won't * go away during the wait.. */ +static inline void folio_wait_locked(struct folio *folio) +{ + if (folio_test_locked(folio)) + wait_on_page_bit(&folio->page, PG_locked); +} + +static inline int folio_wait_locked_killable(struct folio *folio) +{ + if (!folio_test_locked(folio)) + return 0; + return wait_on_page_bit_killable(&folio->page, PG_locked); +} + static inline void wait_on_page_locked(struct page *page) { - if (PageLocked(page)) - wait_on_page_bit(compound_head(page), PG_locked); + folio_wait_locked(page_folio(page)); } static inline int wait_on_page_locked_killable(struct page *page) { - if (!PageLocked(page)) - return 0; - return wait_on_page_bit_killable(compound_head(page), PG_locked); + return folio_wait_locked_killable(page_folio(page)); } int put_and_wait_on_page_locked(struct page *page, int state); -- cgit v1.2.3 From 9138e47ed425246102317dbe452f7f0d4a54c4a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 18 Mar 2021 21:39:45 -0400 Subject: mm/filemap: Add __folio_lock_or_retry() Convert __lock_page_or_retry() to __folio_lock_or_retry(). This actually saves 4 bytes in the only caller of lock_page_or_retry() (due to better register allocation) and saves the 14 byte cost of calling page_folio() in __folio_lock_or_retry() for a total saving of 18 bytes. Also use a bool for the return type. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Reviewed-by: William Kucharski Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 77dbb7f625af..7cf140f98910 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,7 +654,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); -extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, unsigned int flags); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); @@ -715,13 +715,16 @@ static inline int lock_page_killable(struct page *page) * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see - * __lock_page_or_retry(). + * __folio_lock_or_retry(). */ -static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, +static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { + struct folio *folio; might_sleep(); - return trylock_page(page) || __lock_page_or_retry(page, mm, flags); + + folio = page_folio(page); + return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } /* -- cgit v1.2.3 From 575ced1c8b0d3b578b933a68ce67ddaff3df9506 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Dec 2020 01:25:39 -0500 Subject: mm/swap: Add folio_rotate_reclaimable() Convert rotate_reclaimable_page() to folio_rotate_reclaimable(). This eliminates all five of the calls to compound_head() in this function, saving 75 bytes at the cost of adding 15 bytes to its one caller, end_page_writeback(). We also save 36 bytes from pagevec_move_tail_fn() due to using folios there. Net 96 bytes savings. Also move its declaration to mm/internal.h as it's only used by filemap.c. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Acked-by: Kirill A. Shutemov Acked-by: Mike Rapoport Reviewed-by: David Howells --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 85607c6c0cba..c7ecd3ad8e2e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -371,7 +371,6 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); -- cgit v1.2.3 From 4268b48077e55a93959f368aa9d3103ede5d3f0f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Mar 2021 15:21:55 -0500 Subject: mm/filemap: Add folio_end_writeback() Add an end_page_writeback() wrapper function for users that are not yet converted to folios. folio_end_writeback() is less than half the size of end_page_writeback() at just 105 bytes compared to 228 bytes, due to removing all the compound_head() calls. The 30 byte wrapper function makes this a net saving of 93 bytes. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7cf140f98910..d9dcd335cc18 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -767,7 +767,8 @@ static inline int wait_on_page_locked_killable(struct page *page) int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); int wait_on_page_writeback_killable(struct page *page); -extern void end_page_writeback(struct page *page); +void end_page_writeback(struct page *page); +void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); void __set_page_dirty(struct page *, struct address_space *, int warn); -- cgit v1.2.3 From 490e016f229a79dc7551e7f0e989d2304416c189 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Mar 2021 11:09:17 -0500 Subject: mm/writeback: Add folio_wait_writeback() wait_on_page_writeback_killable() only has one caller, so convert it to call folio_wait_writeback_killable(). For the wait_on_page_writeback() callers, add a compatibility wrapper around folio_wait_writeback(). Turning PageWriteback() into folio_test_writeback() eliminates a call to compound_head() which saves 8 bytes and 15 bytes in the two functions. Unfortunately, that is more than offset by adding the wait_on_page_writeback compatibility wrapper for a net increase in text of 7 bytes. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Acked-by: Mike Rapoport Reviewed-by: David Howells --- include/linux/pagemap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9dcd335cc18..e6013c063986 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -766,7 +766,8 @@ static inline int wait_on_page_locked_killable(struct page *page) int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); -int wait_on_page_writeback_killable(struct page *page); +void folio_wait_writeback(struct folio *folio); +int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); -- cgit v1.2.3 From a49d0c507759214a7cfd26555382c314db486792 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Mar 2021 11:25:25 -0500 Subject: mm/writeback: Add folio_wait_stable() Move wait_for_stable_page() into the folio compatibility file. folio_wait_stable() avoids a call to compound_head() and is 14 bytes smaller than wait_for_stable_page() was. The net text size grows by 16 bytes as a result of this patch. We can also remove thp_head() as this was the last user. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells --- include/linux/huge_mm.h | 15 --------------- include/linux/pagemap.h | 1 + 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f123e15d966e..f280f33ff223 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -250,15 +250,6 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, return NULL; } -/** - * thp_head - Head page of a transparent huge page. - * @page: Any page (tail, head or regular) found in the page cache. - */ -static inline struct page *thp_head(struct page *page) -{ - return compound_head(page); -} - /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. @@ -336,12 +327,6 @@ static inline struct list_head *page_deferred_list(struct page *page) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) -static inline struct page *thp_head(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return page; -} - static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e6013c063986..ee39ad7b42f1 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -771,6 +771,7 @@ int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); +void folio_wait_stable(struct folio *folio); void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); -- cgit v1.2.3 From 101c0bf67f50ca0e8b9da97b26f8dc7cb232b4d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Mar 2021 12:02:54 -0500 Subject: mm/filemap: Add folio_wait_bit() Rename wait_on_page_bit() to folio_wait_bit(). We must always wait on the folio, otherwise we won't be woken up due to the tail page hashing to a different bucket from the head page. This commit shrinks the kernel by 770 bytes, mostly due to moving the page waitqueue lookup into folio_wait_bit_common(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells Acked-by: Mike Rapoport --- include/linux/pagemap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ee39ad7b42f1..2f481327dee8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -728,11 +728,11 @@ static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * This is exported only for folio_wait_locked/folio_wait_writeback, etc., * and should not be used directly. */ -extern void wait_on_page_bit(struct page *page, int bit_nr); -extern int wait_on_page_bit_killable(struct page *page, int bit_nr); +void folio_wait_bit(struct folio *folio, int bit_nr); +int folio_wait_bit_killable(struct folio *folio, int bit_nr); /* * Wait for a folio to be unlocked. 
@@ -744,14 +744,14 @@ extern int wait_on_page_bit_killable(struct page *page, int bit_nr); static inline void folio_wait_locked(struct folio *folio) { if (folio_test_locked(folio)) - wait_on_page_bit(&folio->page, PG_locked); + folio_wait_bit(folio, PG_locked); } static inline int folio_wait_locked_killable(struct folio *folio) { if (!folio_test_locked(folio)) return 0; - return wait_on_page_bit_killable(&folio->page, PG_locked); + return folio_wait_bit_killable(folio, PG_locked); } static inline void wait_on_page_locked(struct page *page) -- cgit v1.2.3 From df4d4f12739495332e0d1f916ef4270f7d25d207 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 16 Jan 2021 11:22:14 -0500 Subject: mm/filemap: Convert page wait queues to be folios Reinforce that page flags are actually in the head page by changing the type from page to folio. Increases the size of cachefiles by two bytes, but the kernel core is unchanged in size. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Jeff Layton Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: David Howells --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2f481327dee8..ebc62e9e453b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -628,13 +628,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, } struct wait_page_key { - struct page *page; + struct folio *folio; int bit_nr; int page_match; }; struct wait_page_queue { - struct page *page; + struct folio *folio; int bit_nr; wait_queue_entry_t wait; }; @@ -642,7 +642,7 @@ struct wait_page_queue { static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { - if (wait_page->page != key->page) + if (wait_page->folio != key->folio) return false; key->page_match = 1; @@ -802,7 +802,7 @@ int wait_on_page_private_2_killable(struct page *page); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); +void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. -- cgit v1.2.3 From b47393f8448ade8bafe09ed302ce2a15093e9718 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 22 Apr 2021 22:58:32 -0400 Subject: mm/filemap: Add folio private_2 functions end_page_private_2() becomes folio_end_private_2(), wait_on_page_private_2() becomes folio_wait_private_2() and wait_on_page_private_2_killable() becomes folio_wait_private_2_killable(). Adjust the fscache equivalents to call page_folio() before calling these functions to avoid adding wrappers. Ends up costing 1 byte of text in ceph & netfs, but the core shrinks by three calls to page_folio(). Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Kirill A. 
Shutemov --- include/linux/netfs.h | 6 +++--- include/linux/pagemap.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5d6a4158a9a6..3d4cbf2f7dc4 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -55,7 +55,7 @@ static inline void set_page_fscache(struct page *page) */ static inline void end_page_fscache(struct page *page) { - end_page_private_2(page); + folio_end_private_2(page_folio(page)); } /** @@ -66,7 +66,7 @@ static inline void end_page_fscache(struct page *page) */ static inline void wait_on_page_fscache(struct page *page) { - wait_on_page_private_2(page); + folio_wait_private_2(page_folio(page)); } /** @@ -82,7 +82,7 @@ static inline void wait_on_page_fscache(struct page *page) */ static inline int wait_on_page_fscache_killable(struct page *page) { - return wait_on_page_private_2_killable(page); + return folio_wait_private_2_killable(page_folio(page)); } enum netfs_read_source { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ebc62e9e453b..05f91bfc048d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -795,9 +795,9 @@ static inline void set_page_private_2(struct page *page) SetPagePrivate2(page); } -void end_page_private_2(struct page *page); -void wait_on_page_private_2(struct page *page); -int wait_on_page_private_2_killable(struct page *page); +void folio_end_private_2(struct folio *folio); +void folio_wait_private_2(struct folio *folio); +int folio_wait_private_2_killable(struct folio *folio); /* * Add an arbitrary waiter to a page's wait queue -- cgit v1.2.3 From 6abbaa5b01730a1f0883d199cf5a90ae5c5dccea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Apr 2021 14:24:30 -0400 Subject: fs/netfs: Add folio fscache functions Match the page writeback functions by adding folio_start_fscache(), folio_end_fscache(), folio_wait_fscache() and folio_wait_fscache_killable(). Remove set_page_private_2(). Also rewrite the kernel-doc to describe when to use the function rather than what the function does, and include the kernel-doc in the appropriate rst file. Saves 31 bytes of text in netfs_rreq_unlock() due to set_page_fscache() calling page_folio() once instead of three times. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Acked-by: Kirill A. Shutemov Acked-by: Mike Rapoport Reviewed-by: David Howells --- include/linux/netfs.h | 75 +++++++++++++++++++++++++++++++------------------ include/linux/pagemap.h | 16 ----------- 2 files changed, 48 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3d4cbf2f7dc4..12c4177f7703 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -22,6 +22,7 @@ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that * a page is currently backed by a local disk cache */ +#define folio_test_fscache(folio) folio_test_private_2(folio) #define PageFsCache(page) PagePrivate2((page)) #define SetPageFsCache(page) SetPagePrivate2((page)) #define ClearPageFsCache(page) ClearPagePrivate2((page)) @@ -29,57 +30,77 @@ #define TestClearPageFsCache(page) TestClearPagePrivate2((page)) /** - * set_page_fscache - Set PG_fscache on a page and take a ref - * @page: The page. + * folio_start_fscache - Start an fscache write on a folio. + * @folio: The folio. 
* - * Set the PG_fscache (PG_private_2) flag on a page and take the reference - * needed for the VM to handle its lifetime correctly. This sets the flag and - * takes the reference unconditionally, so care must be taken not to set the - * flag again if it's already set. + * Call this function before writing a folio to a local cache. Starting a + * second write before the first one finishes is not allowed. */ -static inline void set_page_fscache(struct page *page) +static inline void folio_start_fscache(struct folio *folio) { - set_page_private_2(page); + VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); + folio_get(folio); + folio_set_private_2(folio); } /** - * end_page_fscache - Clear PG_fscache and release any waiters - * @page: The page - * - * Clear the PG_fscache (PG_private_2) bit on a page and wake up any sleepers - * waiting for this. The page ref held for PG_private_2 being set is released. + * folio_end_fscache - End an fscache write on a folio. + * @folio: The folio. * - * This is, for example, used when a netfs page is being written to a local - * disk cache, thereby allowing writes to the cache for the same page to be - * serialised. + * Call this function after the folio has been written to the local cache. + * This will wake any sleepers waiting on this folio. */ -static inline void end_page_fscache(struct page *page) +static inline void folio_end_fscache(struct folio *folio) { - folio_end_private_2(page_folio(page)); + folio_end_private_2(folio); } /** - * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page. + * If this folio is currently being written to a local cache, wait for + * the write to finish. Another write may start after this one finishes, + * unless the caller holds the folio lock. */ -static inline void wait_on_page_fscache(struct page *page) +static inline void folio_wait_fscache(struct folio *folio) { - folio_wait_private_2(page_folio(page)); + folio_wait_private_2(folio); } /** - * wait_on_page_fscache_killable - Wait for PG_fscache to be cleared on a page - * @page: The page to wait on + * folio_wait_fscache_killable - Wait for an fscache write on this folio to end. + * @folio: The folio. * - * Wait for PG_fscache (aka PG_private_2) to be cleared on a page or until a - * fatal signal is received by the calling task. + * If this folio is currently being written to a local cache, wait + * for the write to finish or for a fatal signal to be received. + * Another write may start after this one finishes, unless the caller + * holds the folio lock. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. 
*/ +static inline int folio_wait_fscache_killable(struct folio *folio) +{ + return folio_wait_private_2_killable(folio); +} + +static inline void set_page_fscache(struct page *page) +{ + folio_start_fscache(page_folio(page)); +} + +static inline void end_page_fscache(struct page *page) +{ + folio_end_private_2(page_folio(page)); +} + +static inline void wait_on_page_fscache(struct page *page) +{ + folio_wait_private_2(page_folio(page)); +} + static inline int wait_on_page_fscache_killable(struct page *page) { return folio_wait_private_2_killable(page_folio(page)); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 05f91bfc048d..bdbd7be67812 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -779,22 +779,6 @@ int __set_page_dirty_no_writeback(struct page *page); void page_endio(struct page *page, bool is_write, int err); -/** - * set_page_private_2 - Set PG_private_2 on a page and take a ref - * @page: The page. - * - * Set the PG_private_2 flag on a page and take the reference needed for the VM - * to handle its lifetime correctly. This sets the flag and takes the - * reference unconditionally, so care must be taken not to set the flag again - * if it's already set. - */ -static inline void set_page_private_2(struct page *page) -{ - page = compound_head(page); - get_page(page); - SetPagePrivate2(page); -} - void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -- cgit v1.2.3 From dd10ab049beb479dc83bb14a7b5cd68c363983ce Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 12 Apr 2021 16:45:17 -0400 Subject: mm: Add folio_mapped() This function is the equivalent of page_mapped(). It is slightly shorter as we do not need to handle the PageTail() case. Reimplement page_mapped() as a wrapper around folio_mapped(). folio_mapped() is 13 bytes smaller than page_mapped(), but the page_mapped() wrapper is 30 bytes, for a net increase of 17 bytes of text. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Kirill A. Shutemov Acked-by: Mike Rapoport --- include/linux/mm.h | 1 + include/linux/mm_types.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bc5c38e1f780..95e36d0475ac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1769,6 +1769,7 @@ static inline pgoff_t page_index(struct page *page) } bool page_mapped(struct page *page); +bool folio_mapped(struct folio *folio); /* * Return true only if the page has been allocated with diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5ebcb86ac934..82dab23205c3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -299,6 +299,12 @@ FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +static inline atomic_t *folio_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->compound_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; -- cgit v1.2.3 From 874fd90cafdca9d678bceb88f91322af8c9a9d2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 25 Jun 2021 09:27:29 -0400 Subject: mm: Add folio_nid() This is the folio equivalent of page_to_nid(). 
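A minimal usage sketch (the helper below is hypothetical; folio_nid() is the function added here, numa_node_id() is the existing kernel API):

	/* Check whether a folio's memory resides on the local NUMA node. */
	static bool folio_is_local(struct folio *folio)
	{
		return folio_nid(folio) == numa_node_id();
	}
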
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 95e36d0475ac..04e41d4d85ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1429,6 +1429,11 @@ static inline int page_to_nid(const struct page *page) } #endif +static inline int folio_nid(const struct folio *folio) +{ + return page_to_nid(&folio->page); +} + #ifdef CONFIG_NUMA_BALANCING static inline int cpu_pid_to_cpupid(int cpu, int pid) { -- cgit v1.2.3 From 1b7e4464d43a488e383843bf96ec62d12393bff1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 14:59:26 -0400 Subject: mm/memcg: Add folio_memcg() and related functions memcg information is only stored in the head page, so the memcg subsystem needs to assure that all accesses are to the head page. The first step is converting page_memcg() to folio_memcg(). The callers of page_memcg() and PageMemcgKmem() are not yet ready to be converted to use folios, so retain them as wrappers around folio_memcg() and folio_memcg_kmem(). They will be converted in a later patch set. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 110 ++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..06659670db32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -369,7 +369,7 @@ enum page_memcg_data_flags { #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) -static inline bool PageMemcgKmem(struct page *page); +static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at @@ -384,73 +384,77 @@ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) } /* - * __page_memcg - get the memory cgroup associated with a non-kmem page - * @page: a pointer to the page struct + * __folio_memcg - Get the memory cgroup associated with a non-kmem folio + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * kmem pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * kmem folios. 
*/ -static inline struct mem_cgroup *__page_memcg(struct page *page) +static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * __page_objcg - get the object cgroup associated with a kmem page - * @page: a pointer to the page struct + * __folio_objcg - get the object cgroup associated with a kmem folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the object cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the object cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages or - * LRU pages. + * against some type of folios, e.g. slab folios or ex-slab folios or + * LRU folios. */ -static inline struct obj_cgroup *__page_objcg(struct page *page) +static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { - unsigned long memcg_data = page->memcg_data; + unsigned long memcg_data = folio->memcg_data; - VM_BUG_ON_PAGE(PageSlab(page), page); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page); - VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } /* - * page_memcg - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. + * against some type of folios, e.g. slab folios or ex-slab folios. * - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation * - lock_page_memcg() * - exclusive reference * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. 
*/ +static inline struct mem_cgroup *folio_memcg(struct folio *folio) +{ + if (folio_memcg_kmem(folio)) + return obj_cgroup_memcg(__folio_objcg(folio)); + return __folio_memcg(folio); +} + static inline struct mem_cgroup *page_memcg(struct page *page) { - if (PageMemcgKmem(page)) - return obj_cgroup_memcg(__page_objcg(page)); - else - return __page_memcg(page); + return folio_memcg(page_folio(page)); } /* @@ -523,17 +527,18 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) #ifdef CONFIG_MEMCG_KMEM /* - * PageMemcgKmem - check if the page has MemcgKmem flag set - * @page: a pointer to the page struct + * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. + * @folio: Pointer to the folio. * - * Checks if the page has MemcgKmem flag set. The caller must ensure that - * the page has an associated memory cgroup. It's not safe to call this function - * against some types of pages, e.g. slab pages. + * Checks if the folio has MemcgKmem flag set. The caller must ensure + * that the folio has an associated memory cgroup. It's not safe to call + * this function against some types of folios, e.g. slab folios. */ -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { - VM_BUG_ON_PAGE(page->memcg_data & MEMCG_DATA_OBJCGS, page); - return page->memcg_data & MEMCG_DATA_KMEM; + VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); + VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); + return folio->memcg_data & MEMCG_DATA_KMEM; } /* @@ -577,7 +582,7 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #else -static inline bool PageMemcgKmem(struct page *page) +static inline bool folio_memcg_kmem(struct folio *folio) { return false; } @@ -593,6 +598,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif +static inline bool PageMemcgKmem(struct page *page) +{ + return folio_memcg_kmem(page_folio(page)); +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -1115,6 +1125,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 +static inline struct mem_cgroup *folio_memcg(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; @@ -1131,6 +1146,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline bool folio_memcg_kmem(struct folio *folio) +{ + return false; +} + static inline bool PageMemcgKmem(struct page *page) { return false; -- cgit v1.2.3 From 8f425e4ed0eb3ef0b2d85a9efccf947ca6aa9b1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 25 Jun 2021 09:27:04 -0400 Subject: mm/memcg: Convert mem_cgroup_charge() to take a folio Convert all callers of mem_cgroup_charge() to call page_folio() on the page they're currently passing in. Many of them will be converted to use folios themselves soon. 
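The caller conversion described above is mechanical; a sketch assuming a caller that still works with a page (the function below is hypothetical, mem_cgroup_charge() and page_folio() are as in this patch):

	/* Before this patch the caller passed the page directly:
	 *	return mem_cgroup_charge(page, mm, gfp);
	 */
	static int charge_new_page(struct page *page, struct mm_struct *mm, gfp_t gfp)
	{
		return mem_cgroup_charge(page_folio(page), mm, gfp);
	}
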
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 06659670db32..19a51729e00c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -694,14 +694,28 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); + +/** + * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. + * @folio: Folio to charge. + * @mm: mm context of the allocating task. + * @gfp: Reclaim mode. + * + * Try to charge @folio to the memcg that @mm belongs to, reclaiming + * pages according to @gfp if necessary. If @mm is NULL, try to + * charge to the active memcg. + * + * Do not use this for folios allocated for swapin. + * + * Return: 0 on success. Otherwise, an error code is returned. + */ +static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, + gfp_t gfp) { if (mem_cgroup_disabled()) return 0; - return __mem_cgroup_charge(page, mm, gfp_mask); + return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, @@ -1199,8 +1213,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) return false; } -static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) +static inline int mem_cgroup_charge(struct folio *folio, + struct mm_struct *mm, gfp_t gfp) { return 0; } -- cgit v1.2.3 From bbc6b703b21963e909f633cf7718903ed5094319 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 1 May 2021 20:42:23 -0400 Subject: mm/memcg: Convert mem_cgroup_uncharge() to take a folio Convert all the callers to call page_folio(). Most of them were already using a head page, but a few of them I can't prove were, so this may actually fix a bug. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 19a51729e00c..b4bc052db32b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -722,12 +722,19 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void __mem_cgroup_uncharge(struct page *page); -static inline void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct folio *folio); + +/** + * mem_cgroup_uncharge - Uncharge a folio. + * @folio: Folio to uncharge. + * + * Uncharge a folio previously charged with mem_cgroup_charge(). 
+ */ +static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; - __mem_cgroup_uncharge(page); + __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_list(struct list_head *page_list); @@ -1229,7 +1236,7 @@ static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) { } -static inline void mem_cgroup_uncharge(struct page *page) +static inline void mem_cgroup_uncharge(struct folio *folio) { } -- cgit v1.2.3 From d21bba2b7d0ae19dd1279e10aee61c37a17aba74 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 May 2021 18:14:59 -0400 Subject: mm/memcg: Convert mem_cgroup_migrate() to take folios Convert all callers of mem_cgroup_migrate() to call page_folio() first. They all look like they're using head pages already, but this proves it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b4bc052db32b..07eda24ec581 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -745,7 +745,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); +void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node @@ -1244,7 +1244,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_migrate(struct page *old, struct page *new) +static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } -- cgit v1.2.3 From 9d8053fc7a21ee2b3a540165d09418955258d9e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 11:43:01 -0400 Subject: mm/memcg: Convert mem_cgroup_track_foreign_dirty_slowpath() to folio The page was only being used for the memcg and to gather trace information, so this is a simple conversion. The only caller of mem_cgroup_track_foreign_dirty() will be converted to folios in a later patch, so doing this now makes that patch simpler. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 07eda24ec581..1c2776c3a223 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1599,17 +1599,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); -void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + struct folio *folio = page_folio(page); if (mem_cgroup_disabled()) return; - if (unlikely(&page_memcg(page)->css != wb->memcg_css)) - mem_cgroup_track_foreign_dirty_slowpath(page, wb); + if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); -- cgit v1.2.3 From f70ad448741580bf61cdfbeb02229c581409760a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 17:26:00 -0400 Subject: mm/memcg: Add folio_memcg_lock() and folio_memcg_unlock() These are the folio equivalents of lock_page_memcg() and unlock_page_memcg(). lock_page_memcg() and unlock_page_memcg() have too many callers to be easily replaced in a single patch, so reimplement them as wrappers for now to be cleaned up later when enough callers have been converted to use folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1c2776c3a223..be85450f066f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -978,6 +978,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); extern bool cgroup_memory_noswap; #endif +void folio_memcg_lock(struct folio *folio); +void folio_memcg_unlock(struct folio *folio); void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); @@ -1397,6 +1399,14 @@ static inline void unlock_page_memcg(struct page *page) { } +static inline void folio_memcg_lock(struct folio *folio) +{ +} + +static inline void folio_memcg_unlock(struct folio *folio) +{ +} + static inline void mem_cgroup_handle_over_high(void) { } -- cgit v1.2.3 From b1baabd995ab8e830dbf647fe731b51e12b8cedd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 20:00:28 -0400 Subject: mm/memcg: Add folio_lruvec() This replaces mem_cgroup_page_lruvec(). All callers converted. 
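The compatibility wrappers themselves live in mm/memcontrol.c rather than in this header diff; a sketch of their likely shape, following the page_folio() wrapper pattern used throughout this series:

	void lock_page_memcg(struct page *page)
	{
		folio_memcg_lock(page_folio(page));
	}

	void unlock_page_memcg(struct page *page)
	{
		folio_memcg_unlock(page_folio(page));
	}
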
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index be85450f066f..35577caf27d9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -784,18 +784,17 @@ out: } /** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page - * @page: the page + * folio_lruvec - return lruvec for isolating/putting an LRU folio + * @folio: Pointer to the folio. * - * This function relies on page->mem_cgroup being stable. + * This function relies on folio->mem_cgroup being stable. */ -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); - VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); - return mem_cgroup_lruvec(memcg, pgdat); + VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); + return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -1256,10 +1255,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, return &pgdat->__lruvec; } -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec(struct folio *folio) { - pg_data_t *pgdat = page_pgdat(page); - + struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } -- cgit v1.2.3 From e809c3fedeeb806993349e7bf797b4c2b728be7d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 21:59:47 -0400 Subject: mm/memcg: Add folio_lruvec_lock() and similar functions These are the folio equivalents of lock_page_lruvec() and similar functions. Also convert lruvec_memcg_debug() to take a folio. 
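A sketch of the caller conversion (the helper is hypothetical; a not-yet-converted caller wraps its page with page_folio() first):

	/* Before this patch: return mem_cgroup_page_lruvec(page); */
	static struct lruvec *page_lruvec(struct page *page)
	{
		return folio_lruvec(page_folio(page));
	}
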
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35577caf27d9..30d2cd7b5c9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -801,15 +801,16 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); -struct lruvec *lock_page_lruvec(struct page *page); -struct lruvec *lock_page_lruvec_irq(struct page *page); -struct lruvec *lock_page_lruvec_irqsave(struct page *page, +struct lruvec *folio_lruvec_lock(struct folio *folio); +struct lruvec *folio_lruvec_lock_irq(struct folio *folio); +struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM -void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page); +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif @@ -1261,7 +1262,8 @@ static inline struct lruvec *folio_lruvec(struct folio *folio) return &pgdat->__lruvec; } -static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +static inline +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } @@ -1291,26 +1293,26 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -static inline struct lruvec *lock_page_lruvec(struct page *page) +static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irq(struct page *page) +static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } -static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { - struct pglist_data *pgdat = page_pgdat(page); + struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; @@ -1576,6 +1578,7 @@ static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec) static inline struct lruvec *relock_page_lruvec_irq(struct page *page, struct lruvec *locked_lruvec) { + struct folio *folio = page_folio(page); if (locked_lruvec) { if (page_matches_lruvec(page, locked_lruvec)) return locked_lruvec; @@ -1583,13 +1586,14 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page, unlock_page_lruvec_irq(locked_lruvec); } - return lock_page_lruvec_irq(page); + return folio_lruvec_lock_irq(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, struct lruvec *locked_lruvec, unsigned long *flags) { + struct folio *folio = page_folio(page); if (locked_lruvec) { if (page_matches_lruvec(page, locked_lruvec)) return locked_lruvec; @@ -1597,7 +1601,7 @@ static inline struct 
lruvec *relock_page_lruvec_irqsave(struct page *page, unlock_page_lruvec_irqrestore(locked_lruvec, *flags); } - return lock_page_lruvec_irqsave(page, flags); + return folio_lruvec_relock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From 0de340cbed3359423e38ed49242ac9d6986b5cfd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 29 Jun 2021 22:27:31 -0400 Subject: mm/memcg: Add folio_lruvec_relock_irq() and folio_lruvec_relock_irqsave() These are the folio equivalents of relock_page_lruvec_irq() and relock_page_lruvec_irqsave(). Also convert page_matches_lruvec() to folio_matches_lruvec(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 30d2cd7b5c9e..05094eaf1d61 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1568,19 +1568,19 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, } /* Test requires a stable page->memcg binding, see page_memcg() */ -static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec) +static inline bool folio_matches_lruvec(struct folio *folio, + struct lruvec *lruvec) { - return lruvec_pgdat(lruvec) == page_pgdat(page) && - lruvec_memcg(lruvec) == page_memcg(page); + return lruvec_pgdat(lruvec) == folio_pgdat(folio) && + lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irq(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { - struct folio *folio = page_folio(page); if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); @@ -1590,12 +1590,11 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page, } /* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, +static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec *locked_lruvec, unsigned long *flags) { - struct folio *folio = page_folio(page); if (locked_lruvec) { - if (page_matches_lruvec(page, locked_lruvec)) + if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irqrestore(locked_lruvec, *flags); -- cgit v1.2.3 From c5ce619a77ce00d537ef512e7a823c99ce890a40 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 17:19:13 -0400 Subject: mm/workingset: Convert workingset_activation to take a folio This function already assumed it was being passed a head page. No real change here, except that thp_nr_pages() compiles away on kernels with THP compiled out while folio_nr_pages() is always present. Also convert page_memcg_rcu() to folio_memcg_rcu().
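folio_memcg_rcu() keeps the same locking contract as page_memcg_rcu(); a usage sketch (the helper below is hypothetical):

	/* Lockless check for a memcg association; the RCU read lock must be
	 * held across the lookup, as folio_memcg_rcu() asserts.
	 */
	static bool folio_charged(struct folio *folio)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = folio_memcg_rcu(folio);
		rcu_read_unlock();
		return memcg != NULL;
	}
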
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 22 ++++++++++++---------- include/linux/swap.h | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 05094eaf1d61..7bd78c13d1fa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -457,20 +457,22 @@ static inline struct mem_cgroup *page_memcg(struct page *page) return folio_memcg(page_folio(page)); } -/* - * page_memcg_rcu - locklessly get the memory cgroup associated with a page - * @page: a pointer to the page struct +/** + * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function assumes that the page is known to have a + * This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function - * against some type of pages, e.g. slab pages or ex-slab pages. + * against some type of folios, e.g. slab folios or ex-slab folios. + * + * Return: A pointer to the memory cgroup associated with the folio, + * or NULL. */ -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { - unsigned long memcg_data = READ_ONCE(page->memcg_data); + unsigned long memcg_data = READ_ONCE(folio->memcg_data); - VM_BUG_ON_PAGE(PageSlab(page), page); + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { @@ -1158,7 +1160,7 @@ static inline struct mem_cgroup *page_memcg(struct page *page) return NULL; } -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; diff --git a/include/linux/swap.h b/include/linux/swap.h index c7ecd3ad8e2e..0fc84797623f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -330,7 +330,7 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); -void workingset_activation(struct page *page); +void workingset_activation(struct folio *folio); /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); -- cgit v1.2.3 From bf6bd276b374d44f6e7146d52aa6097eb91384a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 May 2021 10:55:27 -0400 Subject: mm: Add folio_pfn() This is the folio equivalent of page_to_pfn(). Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 04e41d4d85ea..47143f3e7f0a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1624,6 +1624,20 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. 
The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. + */ +static inline unsigned long folio_pfn(struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) -- cgit v1.2.3 From 2a5a8fa8b23144d14567d6f8293dd6fbeecee393 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Sep 2021 18:16:01 +0200 Subject: leds: trigger: use RCU to protect the led_cdevs list Even with the previous commit 27af8e2c90fb ("leds: trigger: fix potential deadlock with libata") to this file, we still get lockdep unhappy, and Boqun explained the report here: https://lore.kernel.org/r/YNA+d1X4UkoQ7g8a@boqun-archlinux Effectively, this means that the read_lock_irqsave() isn't enough here because another CPU might be trying to do a write lock, and thus block the readers. This is all pretty messy, but it doesn't seem right that the LEDs framework imposes some locking requirements on users, in particular we'd have to make the spinlock in the iwlwifi driver always disable IRQs, even if we don't need that for any other reason, just to avoid this deadlock. Since writes to the led_cdevs list are rare (and are done by userspace), just switch the list to RCU. This costs a synchronize_rcu() at removal time so we can ensure things are correct, but that seems like a small price to pay for getting lock-free iterations and no deadlocks (nor any locking requirements imposed on users.) Signed-off-by: Johannes Berg Signed-off-by: Pavel Machek --- include/linux/leds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index a0b730be40ad..ba4861ec73d3 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -360,7 +360,7 @@ struct led_trigger { struct led_hw_trigger_type *trigger_type; /* LEDs under control by this trigger (for simple triggers) */ - rwlock_t leddev_list_lock; + spinlock_t leddev_list_lock; struct list_head led_cdevs; /* Link to next registered trigger */ -- cgit v1.2.3 From 4795448117824cc4900d696f17d1516c56371045 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Sep 2021 14:53:44 +0200 Subject: PCI: ACPI: Drop acpi_pci_bus The acpi_pci_bus structure was used primarily for running acpi_pci_find_companion() during PCI device objects registration, but after commit 375553a93201 ("PCI: Setup ACPI fwnode early and at the same time with OF") that function is called by pci_setup_device() via pci_set_acpi_fwnode(), which happens before calling pci_device_add() on the new PCI device object, so its ACPI companion has been set already when acpi_device_notify() runs and it will never call ->find_companion() from acpi_pci_bus. For this reason, modify acpi_device_notify() and acpi_device_notify_remove() to call pci_acpi_setup() and pci_acpi_cleanup(), respectively, directly on PCI device objects and drop acpi_pci_bus altogether. While at it, notice that pci_acpi_setup() and pci_acpi_cleanup() can obtain the ACPI companion pointer, which is guaranteed to not be NULL, from their callers and modify them to work that way so as to reduce the number of redundant checks somewhat. Signed-off-by: Rafael J. 
Wysocki Tested-by: Ferry Toth --- include/linux/pci-acpi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index f16de399d2de..078225b514d4 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -84,6 +84,14 @@ extern struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, void acpi_pci_add_bus(struct pci_bus *bus); void acpi_pci_remove_bus(struct pci_bus *bus); +#ifdef CONFIG_PCI +void pci_acpi_setup(struct device *dev, struct acpi_device *adev); +void pci_acpi_cleanup(struct device *dev, struct acpi_device *adev); +#else +static inline void pci_acpi_setup(struct device *dev, struct acpi_device *adev) {} +static inline void pci_acpi_cleanup(struct device *dev, struct acpi_device *adev) {} +#endif + #ifdef CONFIG_ACPI_PCI_SLOT void acpi_pci_slot_init(void); void acpi_pci_slot_enumerate(struct pci_bus *bus); -- cgit v1.2.3 From e765f13ed126fe7e41d1a6e3c60d754cd6c2af93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Sep 2021 19:34:30 +0200 Subject: nvdimm/pmem: move dax_attribute_group from dax to pmem dax_attribute_group is only used by the pmem driver, and can avoid the completely pointless lookup by the disk name if moved there. This leaves just a single caller of dax_get_by_host, so move dax_get_by_host into the same ifdef block as that caller. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20210922173431.2454024-3-hch@lst.de Signed-off-by: Dan Williams --- include/linux/dax.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 2619d94c308d..8623caa67388 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,8 +38,6 @@ struct dax_operations { int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; -extern struct attribute_group dax_attribute_group; - #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops, unsigned long flags); -- cgit v1.2.3 From 7b4d7894c65bd1e83b2f021d7944a46bdd64836a Mon Sep 17 00:00:00 2001 From: Deepak Kumar Singh Date: Tue, 31 Aug 2021 20:00:27 +0530 Subject: soc: qcom: aoss: Expose send for generic usecase Not all upcoming usecases will have an interface to allow the aoss driver to hook onto. Expose the send api and create a get function to enable drivers to send their own messages to aoss. Signed-off-by: Chris Lew Signed-off-by: Deepak Kumar Singh Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/1630420228-31075-2-git-send-email-deesin@codeaurora.org --- include/linux/soc/qcom/qcom_aoss.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 include/linux/soc/qcom/qcom_aoss.h (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qcom_aoss.h b/include/linux/soc/qcom/qcom_aoss.h new file mode 100644 index 000000000000..3c2a82e606f8 --- /dev/null +++ b/include/linux/soc/qcom/qcom_aoss.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. 
+ */ + +#ifndef __QCOM_AOSS_H__ +#define __QCOM_AOSS_H__ + +#include <linux/err.h> +#include <linux/device.h> + +struct qmp; + +#if IS_ENABLED(CONFIG_QCOM_AOSS_QMP) + +int qmp_send(struct qmp *qmp, const void *data, size_t len); +struct qmp *qmp_get(struct device *dev); +void qmp_put(struct qmp *qmp); + +#else + +static inline int qmp_send(struct qmp *qmp, const void *data, size_t len) +{ + return -ENODEV; +} + +static inline struct qmp *qmp_get(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + +static inline void qmp_put(struct qmp *qmp) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 99139b80c1b3d73026ed8be2de42c52e2976ab64 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Mon, 27 Sep 2021 14:55:40 +0100 Subject: soc: qcom: apr: make code more reusable APR and other packet routers like GPR are pretty much the same and interact with other drivers in a similar way. Ex: GPR ports can be considered as APR services; the only difference is that they are allocated dynamically. The other difference is the packet layout, which should not matter with the APIs abstracted. Apart from this, the rest of the functionality is pretty much identical across APR and GPR. Make the apr code more reusable by abstracting it at the service level, rather than the device level, so that we do not need to write new drivers for other new packet routers like GPR. This patch is in preparation to add GPR support to this driver. Signed-off-by: Srinivas Kandagatla Reviewed-by: Pierre-Louis Bossart Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210927135559.738-4-srinivas.kandagatla@linaro.org --- include/linux/soc/qcom/apr.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h index 137f9f2ac4c3..7bca213a3f83 100644 --- a/include/linux/soc/qcom/apr.h +++ b/include/linux/soc/qcom/apr.h @@ -79,6 +79,15 @@ struct apr_resp_pkt { #define APR_SVC_MAJOR_VERSION(v) ((v >> 16) & 0xFF) #define APR_SVC_MINOR_VERSION(v) (v & 0xFF) +struct packet_router; +struct pkt_router_svc { + struct device *dev; + struct packet_router *pr; + spinlock_t lock; + int id; + void *priv; +}; + struct apr_device { struct device dev; uint16_t svc_id; @@ -86,11 +95,12 @@ struct apr_device { uint32_t version; char name[APR_NAME_SIZE]; const char *service_path; - spinlock_t lock; + struct pkt_router_svc svc; struct list_head node; }; #define to_apr_device(d) container_of(d, struct apr_device, dev) +#define svc_to_apr_device(d) container_of(d, struct apr_device, svc) struct apr_driver { int (*probe)(struct apr_device *sl); -- cgit v1.2.3 From ec1471a898cca38af6b8956a83ebc1297214546f Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Mon, 27 Sep 2021 14:55:42 +0100 Subject: soc: qcom: apr: Add GPR support Qualcomm Generic Packet Router (aka GPR) is the IPC mechanism found in the AudioReach next-generation signal processing framework, used to perform command and response messages between various processors. GPR has the concepts of static and dynamic ports: all static services like APM (Audio Processing Manager) and PRM (Proxy Resource Manager) have fixed port numbers, whereas dynamic services like graphs have dynamic port numbers which are allocated at runtime. All GPR packet messages will have source and destination domain and port along with opcode and payload.
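As a rough illustration of that layout (not part of the patch; this builds on the gpr_hdr fields and helpers added in the hunks below, and the domain constants, destination port and opcode are made-up placeholders):

    struct gpr_pkt *pkt;
    int psize = GPR_HDR_SIZE + 2 * sizeof(uint32_t); /* header + 2 payload words */
    int ret;

    pkt = kzalloc(psize, GFP_KERNEL);
    if (!pkt)
        return -ENOMEM;
    pkt->hdr.version = GPR_PKT_VER;
    pkt->hdr.hdr_size = GPR_PKT_HEADER_WORD_SIZE;
    pkt->hdr.pkt_size = psize;
    pkt->hdr.dest_domain = GPR_DOMAIN_ID_ADSP;  /* assumed constant */
    pkt->hdr.src_domain = GPR_DOMAIN_ID_APPS;   /* assumed constant */
    pkt->hdr.dest_port = APM_STATIC_PORT_ID;    /* hypothetical static port */
    pkt->hdr.src_port = port->id;               /* dynamic port from gpr_alloc_port() */
    pkt->hdr.opcode = SOME_APM_CMD;             /* hypothetical opcode */

    ret = gpr_send_port_pkt(port, pkt);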
Signed-off-by: Srinivas Kandagatla Reviewed-by: Pierre-Louis Bossart Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210927135559.738-6-srinivas.kandagatla@linaro.org --- include/linux/soc/qcom/apr.h | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/apr.h b/include/linux/soc/qcom/apr.h index 7bca213a3f83..23c5b30f3511 100644 --- a/include/linux/soc/qcom/apr.h +++ b/include/linux/soc/qcom/apr.h @@ -7,6 +7,7 @@ #include #include #include +#include extern struct bus_type aprbus; @@ -75,19 +76,65 @@ struct apr_resp_pkt { int payload_size; }; +struct gpr_hdr { + uint32_t version:4; + uint32_t hdr_size:4; + uint32_t pkt_size:24; + uint32_t dest_domain:8; + uint32_t src_domain:8; + uint32_t reserved:16; + uint32_t src_port; + uint32_t dest_port; + uint32_t token; + uint32_t opcode; +} __packed; + +struct gpr_pkt { + struct gpr_hdr hdr; + uint32_t payload[]; +}; + +struct gpr_resp_pkt { + struct gpr_hdr hdr; + void *payload; + int payload_size; +}; + +#define GPR_HDR_SIZE sizeof(struct gpr_hdr) +#define GPR_PKT_VER 0x0 +#define GPR_PKT_HEADER_WORD_SIZE ((sizeof(struct gpr_pkt) + 3) >> 2) +#define GPR_PKT_HEADER_BYTE_SIZE (GPR_PKT_HEADER_WORD_SIZE << 2) + +#define GPR_BASIC_RSP_RESULT 0x02001005 + +struct gpr_ibasic_rsp_result_t { + uint32_t opcode; + uint32_t status; +}; + +#define GPR_BASIC_EVT_ACCEPTED 0x02001006 + +struct gpr_ibasic_rsp_accepted_t { + uint32_t opcode; +}; + /* Bits 0 to 15 -- Minor version, Bits 16 to 31 -- Major version */ #define APR_SVC_MAJOR_VERSION(v) ((v >> 16) & 0xFF) #define APR_SVC_MINOR_VERSION(v) (v & 0xFF) +typedef int (*gpr_port_cb) (struct gpr_resp_pkt *d, void *priv, int op); struct packet_router; struct pkt_router_svc { struct device *dev; + gpr_port_cb callback; struct packet_router *pr; spinlock_t lock; int id; void *priv; }; +typedef struct pkt_router_svc gpr_port_t; + struct apr_device { struct device dev; uint16_t svc_id; @@ -99,6 +146,8 @@ struct apr_device { struct list_head node; }; +typedef struct apr_device gpr_device_t; + #define to_apr_device(d) container_of(d, struct apr_device, dev) #define svc_to_apr_device(d) container_of(d, struct apr_device, svc) @@ -107,10 +156,12 @@ struct apr_driver { int (*remove)(struct apr_device *sl); int (*callback)(struct apr_device *a, struct apr_resp_pkt *d); + int (*gpr_callback)(struct gpr_resp_pkt *d, void *data, int op); struct device_driver driver; const struct apr_device_id *id_table; }; +typedef struct apr_driver gpr_driver_t; #define to_apr_driver(d) container_of(d, struct apr_driver, driver) /* @@ -133,7 +184,14 @@ void apr_driver_unregister(struct apr_driver *drv); #define module_apr_driver(__apr_driver) \ module_driver(__apr_driver, apr_driver_register, \ apr_driver_unregister) +#define module_gpr_driver(__gpr_driver) module_apr_driver(__gpr_driver) int apr_send_pkt(struct apr_device *adev, struct apr_pkt *pkt); +gpr_port_t *gpr_alloc_port(gpr_device_t *gdev, struct device *dev, + gpr_port_cb cb, void *priv); +void gpr_free_port(gpr_port_t *port); +int gpr_send_port_pkt(gpr_port_t *port, struct gpr_pkt *pkt); +int gpr_send_pkt(gpr_device_t *gdev, struct gpr_pkt *pkt); + #endif /* __QCOM_APR_H_ */ -- cgit v1.2.3 From af3826db74d184bc9c2c9d3ff34548e5f317a6f3 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Tue, 28 Sep 2021 11:25:26 +0530 Subject: octeontx2-pf: Use hardware register for CQE count Current driver uses software CQ head pointer to poll on CQE header in memory to determine if CQE 
is valid. Software needs to make sure that the reads of the CQE do not get re-ordered so much that it ends up with an inconsistent view of the CQE. To ensure that, a DMB barrier is needed after reading the first CQE cacheline and before reading the rest of the CQE. But having a barrier for every CQE read would impact performance; instead, use the hardware CQ head and tail pointers to find the number of valid CQEs. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S. Miller --- include/linux/soc/marvell/octeontx2/asm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index fa1d6af0164e..0f79fd7f81a1 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -34,9 +34,23 @@ : [rf] "+r"(val) \ : [rs] "r"(addr)); \ }) + +static inline u64 otx2_atomic64_fetch_add(u64 incr, u64 *ptr) +{ + u64 result; + + asm volatile (".cpu generic+lse\n" + "ldadda %x[i], %x[r], [%[b]]" + : [r] "=r" (result), "+m" (*ptr) + : [i] "r" (incr), [b] "r" (ptr) + : "memory"); + return result; +} + #else #define otx2_lmt_flush(ioaddr) ({ 0; }) #define cn10k_lmt_flush(val, addr) ({ addr = val; }) +#define otx2_atomic64_fetch_add(incr, ptr) ({ incr; }) #endif #endif /* __SOC_OTX2_ASM_H */ -- cgit v1.2.3 From 62b8963cd84df1fc04986cd9b27586acce758f36 Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Chitrapu Date: Tue, 28 Sep 2021 14:00:45 +0300 Subject: ieee80211: Add new A-MPDU factor macro for HE 6 GHz peer caps Add IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR as per IEEE Std 802.11ax-2021, 9.4.2.263 to use for the peer max A-MPDU factor in the 6 GHz band. Signed-off-by: Pradeep Kumar Chitrapu Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210913175510.193005-1-jouni@codeaurora.org --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 694264503119..a1a7eda35cb5 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2084,6 +2084,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 #define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 +#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR 13 /* 802.11ax HE PHY capabilities */ #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 -- cgit v1.2.3 From 8de1e9b01b03d2805f58aa490ea5bf0df003bc80 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Wed, 22 Sep 2021 11:28:50 +0300 Subject: net/mlx5: Add uid field to UAR allocation structures Add uid field to mlx5_ifc_alloc_uar_in_bits and mlx5_ifc_dealloc_uar_in_bits structs. This field will be used by FW to manage UAR according to uid.
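A minimal sketch (not part of the patch) of how a caller fills the new field; 'dev', 'uid' and 'uarn' are assumed to be provided by the surrounding code, with uid 0 for kernel-owned UARs:

    u32 in[MLX5_ST_SZ_DW(alloc_uar_in)] = {};
    u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {};
    int err;

    MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR);
    MLX5_SET(alloc_uar_in, in, uid, uid);   /* DevX UID, or 0 for kernel */
    err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
    if (!err)
        *uarn = MLX5_GET(alloc_uar_out, out, uar);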
Signed-off-by: Meir Lichtinger Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3638d09ba77..96f5fb2af811 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7569,7 +7569,7 @@ struct mlx5_ifc_dealloc_uar_out_bits { struct mlx5_ifc_dealloc_uar_in_bits { u8 opcode[0x10]; - u8 reserved_at_10[0x10]; + u8 uid[0x10]; u8 reserved_at_20[0x10]; u8 op_mod[0x10]; @@ -8416,7 +8416,7 @@ struct mlx5_ifc_alloc_uar_out_bits { struct mlx5_ifc_alloc_uar_in_bits { u8 opcode[0x10]; - u8 reserved_at_10[0x10]; + u8 uid[0x10]; u8 reserved_at_20[0x10]; u8 op_mod[0x10]; -- cgit v1.2.3 From d2c8a1554c10d5e0443b1f97f480d7dacd55cf55 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Wed, 22 Sep 2021 11:28:51 +0300 Subject: IB/mlx5: Enable UAR to have DevX UID UID field was added to alloc_uar and dealloc_uar PRM command, to specify DevX UID for UAR. This change enables firmware validating user access to its own UAR resources. For the kernel allocated UARs the UID will stay 0 as of today. Signed-off-by: Meir Lichtinger Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23417424373..1b8bae246b28 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1005,8 +1005,6 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); -int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); void mlx5_health_flush(struct mlx5_core_dev *dev); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); -- cgit v1.2.3 From af9d82626c8f8547910e3e22c76ced3f5d5f5286 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Aug 2021 14:20:51 +0200 Subject: PCI/ACPI: Remove OSC_PCI_SUPPORT_MASKS and OSC_PCI_CONTROL_MASKS These masks are only used internally in the PCI Host Bridge _OSC negotiation code, which already makes sure nothing outside of these masks is set. Remove the masks and simplify the code. Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20210824122054.29481-2-joro@8bytes.org Signed-off-by: Joerg Roedel Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..eb59fd3052c0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -577,7 +577,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 -#define OSC_PCI_SUPPORT_MASKS 0x0000019f /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 @@ -587,7 +586,6 @@ extern u32 osc_sb_native_usb4_control; #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 -#define OSC_PCI_CONTROL_MASKS 0x000000bf #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 -- cgit v1.2.3 From fec2432c9a7370788faab416b38589ac9f4350e5 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Wed, 22 Sep 2021 14:05:29 +0300 Subject: bus/fsl-mc: Add generic implementation for open/reset/close commands The open/reset/close command format is similar for all objects. Currently there are multiple implementations for these commands scattered through various drivers. The code is quasi-identical. Create a generic implementation for the open/reset/close commands. One of the consumers will be the VFIO driver which needs to be able to reset a device. Signed-off-by: Diana Craciun Reviewed-by: Laurentiu Tudor Link: https://lore.kernel.org/r/20210922110530.24736-1-diana.craciun@oss.nxp.com Signed-off-by: Alex Williamson --- include/linux/fsl/mc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 30ece3ae6df7..e026f6c48b49 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -620,6 +620,20 @@ int dpcon_reset(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token); +int fsl_mc_obj_open(struct fsl_mc_io *mc_io, + u32 cmd_flags, + int obj_id, + char *obj_type, + u16 *token); + +int fsl_mc_obj_close(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token); + +int fsl_mc_obj_reset(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token); + /** * struct dpcon_attr - Structure representing DPCON attributes * @id: DPCON object ID -- cgit v1.2.3 From 3d717fad5081b8e3bda76d86907fad95398cbde8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 28 Sep 2021 16:09:45 -0700 Subject: bpf: Replace "want address" users of BPF_CAST_CALL with BPF_CALL_IMM In order to keep ahead of cases in the kernel where Control Flow Integrity (CFI) may trip over function call casts, enabling -Wcast-function-type is helpful. To that end, BPF_CAST_CALL causes various warnings and is one of the last places in the kernel triggering this warning. Most places using BPF_CAST_CALL actually just want a void * to perform math on. It's not actually performing a call, so just use a different helper to get the void *, by way of the new BPF_CALL_IMM() helper, which can clean up a common copy/paste idiom as well. This change results in no object code difference. Signed-off-by: Kees Cook Signed-off-by: Alexei Starovoitov Reviewed-by: Gustavo A. R.
Silva Acked-by: Andrii Nakryiko Link: https://github.com/KSPP/linux/issues/20 Link: https://lore.kernel.org/lkml/CAEf4Bzb46=-J5Fxc3mMZ8JQPtK1uoE0q6+g6WPz53Cvx=CBEhw@mail.gmail.com Link: https://lore.kernel.org/bpf/20210928230946.4062144-2-keescook@chromium.org --- include/linux/filter.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4a93c12543ee..6c247663d4ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -365,13 +365,17 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) #define BPF_CAST_CALL(x) \ ((u64 (*)(u64, u64, u64, u64, u64))(x)) +/* Convert function address to BPF immediate */ + +#define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) + #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ - .imm = ((FUNC) - __bpf_call_base) }) + .imm = BPF_CALL_IMM(FUNC) }) /* Raw code statement block */ -- cgit v1.2.3 From 102acbacfd9a96d101abd96d1a7a5bf92b7c3e8e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 28 Sep 2021 16:09:46 -0700 Subject: bpf: Replace callers of BPF_CAST_CALL with proper function typedef In order to keep ahead of cases in the kernel where Control Flow Integrity (CFI) may trip over function call casts, enabling -Wcast-function-type is helpful. To that end, BPF_CAST_CALL causes various warnings and is one of the last places in the kernel triggering this warning. For actual function calls, replace BPF_CAST_CALL() with a typedef, which captures the same details about the given function pointers. This change results in no object code difference. Signed-off-by: Kees Cook Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Gustavo A. R. 
Silva Link: https://github.com/KSPP/linux/issues/20 Link: https://lore.kernel.org/lkml/CAEf4Bzb46=-J5Fxc3mMZ8JQPtK1uoE0q6+g6WPz53Cvx=CBEhw@mail.gmail.com Link: https://lore.kernel.org/bpf/20210928230946.4062144-3-keescook@chromium.org --- include/linux/bpf.h | 4 +++- include/linux/filter.h | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b6c45a6cbbba..19735d59230a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; +typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); @@ -142,7 +143,8 @@ struct bpf_map_ops { int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); - int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + int (*map_for_each_callback)(struct bpf_map *map, + bpf_callback_t callback_fn, void *callback_ctx, u64 flags); /* BTF name and id of struct allocated by map_alloc */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 6c247663d4ce..47f80adbe744 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -360,11 +360,6 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = TGT }) -/* Function call */ - -#define BPF_CAST_CALL(x) \ - ((u64 (*)(u64, u64, u64, u64, u64))(x)) - /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) -- cgit v1.2.3 From 7c2dcfa295b149a58010632c7eb7e73bd0626a7a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 28 Sep 2021 20:45:19 +0200 Subject: net: phy: micrel: Add support for LAN8804 PHY The LAN8804 PHY has same features as that of LAN8814 PHY except that it doesn't support 1588, SyncE or Q-USGMII. This PHY is found inside the LAN966X switches. Reviewed-by: Andrew Lunn Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 3d43c60b49fa..1f7c33b2f5a3 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -28,6 +28,7 @@ #define PHY_ID_KSZ9031 0x00221620 #define PHY_ID_KSZ9131 0x00221640 #define PHY_ID_LAN8814 0x00221660 +#define PHY_ID_LAN8804 0x00221670 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From e81e99bacc9f9347bda7808a949c1ce9fcc2bbf4 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 29 Sep 2021 11:32:59 +0900 Subject: swiotlb: Support aligned swiotlb buffers Add an argument to swiotlb_tbl_map_single that specifies the desired alignment of the allocated buffer. This is used by dma-iommu to ensure the buffer is aligned to the iova granule size when using swiotlb with untrusted sub-granule mappings. This addresses an issue where adjacent slots could be exposed to the untrusted device if IO_TLB_SIZE < iova granule < PAGE_SIZE. 
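To make the new parameter concrete, here is a sketch of the kind of call site this enables (modeled on the dma-iommu bounce path; 'iovad', 'paddr', 'org_size' and 'aligned_size' are assumed to come from the surrounding mapping code):

    unsigned int align_mask = iova_mask(iovad); /* IOVA granule size - 1 */
    phys_addr_t tlb_addr;

    /* bounce the untrusted mapping, keeping it granule-aligned in swiotlb */
    tlb_addr = swiotlb_tbl_map_single(dev, paddr, org_size, aligned_size,
                                      align_mask, dir, attrs);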
Signed-off-by: David Stevens Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210929023300.335969-7-stevensd@google.com Signed-off-by: Joerg Roedel --- include/linux/swiotlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b0cb2a9973f4..569272871375 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,7 +45,8 @@ extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs); + unsigned int alloc_aligned_mask, enum dma_data_direction dir, + unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, -- cgit v1.2.3 From e5af50a5df571c1d0268b02f924de49b742c990f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 23 Sep 2021 18:06:55 -0700 Subject: arm64: kasan: mte: move GCR_EL1 switch to task switch when KASAN disabled It is not necessary to write to GCR_EL1 on every kernel entry and exit when HW tag-based KASAN is disabled because the kernel will not execute any IRG instructions in that mode. Since accessing GCR_EL1 can be expensive on some microarchitectures, avoid doing so by moving the access to task switch when HW tag-based KASAN is disabled. Signed-off-by: Peter Collingbourne Acked-by: Andrey Konovalov Link: https://linux-review.googlesource.com/id/I78e90d60612a94c24344526f476ac4ff216e10d2 Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20210924010655.2886918-1-pcc@google.com Signed-off-by: Will Deacon --- include/linux/kasan.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dd874a1ee862..de5f5913374d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -89,7 +89,7 @@ static __always_inline bool kasan_enabled(void) return static_branch_likely(&kasan_flag_enabled); } -static inline bool kasan_has_integrated_init(void) +static inline bool kasan_hw_tags_enabled(void) { return kasan_enabled(); } @@ -104,7 +104,7 @@ static inline bool kasan_enabled(void) return IS_ENABLED(CONFIG_KASAN); } -static inline bool kasan_has_integrated_init(void) +static inline bool kasan_hw_tags_enabled(void) { return false; } @@ -125,6 +125,11 @@ static __always_inline void kasan_free_pages(struct page *page, #endif /* CONFIG_KASAN_HW_TAGS */ +static inline bool kasan_has_integrated_init(void) +{ + return kasan_hw_tags_enabled(); +} + #ifdef CONFIG_KASAN struct kasan_cache { -- cgit v1.2.3 From 7101c83950e629b83f9d827f288063e52074a6ea Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Wed, 29 Sep 2021 19:23:50 -0700 Subject: platform/chrome: cros_usbpd_notify: Move ec_command() cros_ec_command() can be used by other modules too. So, move it to a common location and export it. This patch does not introduce any functional changes. 
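A sketch of a caller using the newly exported helper (EC_CMD_USB_PD_PORTS and its response struct are just an illustrative command from the EC protocol headers; later patches in this series widen the signature further, as the following commits show):

    struct ec_response_usb_pd_ports resp;
    int ret;

    ret = cros_ec_command(ec_dev, EC_CMD_USB_PD_PORTS,
                          NULL, 0, (uint8_t *)&resp, sizeof(resp));
    if (ret < 0)
        return ret;
    /* resp.num_ports is now valid */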
Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210930022403.3358070-3-pmalani@chromium.org --- include/linux/platform_data/cros_ec_proto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 55844ece0b32..20b17c43caeb 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -231,6 +231,9 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature); int cros_ec_get_sensor_count(struct cros_ec_dev *ec); +int cros_ec_command(struct cros_ec_device *ec_dev, int command, uint8_t *outdata, int outsize, + uint8_t *indata, int insize); + /** * cros_ec_get_time_ns() - Return time in ns. * -- cgit v1.2.3 From 5d122256f4e5900f7f8de5d8787af570314f6701 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Wed, 29 Sep 2021 19:23:52 -0700 Subject: platform/chrome: cros_ec_proto: Make data pointers void Convert the input and output data pointers for cros_ec_command() to void pointers so that the callers don't have to cast their custom structs to uint8_t *. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210930022403.3358070-4-pmalani@chromium.org --- include/linux/platform_data/cros_ec_proto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 20b17c43caeb..f833473c5f44 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -231,8 +231,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature); int cros_ec_get_sensor_count(struct cros_ec_dev *ec); -int cros_ec_command(struct cros_ec_device *ec_dev, int command, uint8_t *outdata, int outsize, - uint8_t *indata, int insize); +int cros_ec_command(struct cros_ec_device *ec_dev, int command, void *outdata, int outsize, + void *indata, int insize); /** * cros_ec_get_time_ns() - Return time in ns. -- cgit v1.2.3 From 4f1406396ed4d97518b8112327bdaf14fc9d4090 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Wed, 29 Sep 2021 19:23:54 -0700 Subject: platform/chrome: cros_ec_proto: Add version for ec_command Add a version parameter to cros_ec_command() for callers that may want to specify which version of the host command they would like to use. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210930022403.3358070-5-pmalani@chromium.org --- include/linux/platform_data/cros_ec_proto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index f833473c5f44..9d370816a419 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -231,8 +231,8 @@ bool cros_ec_check_features(struct cros_ec_dev *ec, int feature); int cros_ec_get_sensor_count(struct cros_ec_dev *ec); -int cros_ec_command(struct cros_ec_device *ec_dev, int command, void *outdata, int outsize, - void *indata, int insize); +int cros_ec_command(struct cros_ec_device *ec_dev, unsigned int version, int command, void *outdata, + int outsize, void *indata, int insize); /** * cros_ec_get_time_ns() - Return time in ns. 
-- cgit v1.2.3 From 381cecc5d7b777ada7cdf12f5b0bf4caf43bf7aa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 3 Sep 2021 09:51:38 +0200 Subject: KVM: Drop 'except' parameter from kvm_make_vcpus_request_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both remaining callers of kvm_make_vcpus_request_mask() pass 'NULL' for the 'except' parameter so it can just be dropped. No functional change intended. Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210903075141.403071-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0f18df7fe874..89e1a0069833 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -160,7 +160,6 @@ static inline bool is_error_page(struct page *page) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, -- cgit v1.2.3 From 620b2438abf98f09e19802cbc3bc2e98179cdbe2 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 3 Sep 2021 09:51:41 +0200 Subject: KVM: Make kvm_make_vcpus_request_mask() use pre-allocated cpu_kick_mask kvm_make_vcpus_request_mask() already disables preemption so just like kvm_make_all_cpus_request_except() it can be switched to using pre-allocated per-cpu cpumasks. This allows for improvements for both users of the function: in Hyper-V emulation code 'tlb_flush' can now be dropped from 'struct kvm_vcpu_hv' and kvm_make_scan_ioapic_request_mask() gets rid of dynamic allocation. cpumask_available() checks in kvm_make_vcpu_request() and kvm_kick_many_cpus() can now be dropped as they check for an impossible condition: kvm_init() makes sure per-cpu masks are allocated. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini Message-Id: <20210903075141.403071-9-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 89e1a0069833..f1b96a2ebaa7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -160,7 +160,7 @@ static inline bool is_error_page(struct page *page) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); + unsigned long *vcpu_bitmap); bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except); -- cgit v1.2.3 From a1c42ddedf35dbf5f25ea0982ed6e226eef7a78c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 13 Sep 2021 15:57:44 +0200 Subject: kvm: rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS KVM_MAX_VCPU_ID is not specifying the highest allowed vcpu-id, but the number of allowed vcpu-ids.
This has already led to confusion, so rename KVM_MAX_VCPU_ID to KVM_MAX_VCPU_IDS to make its semantics more clear. Suggested-by: Eduardo Habkost Signed-off-by: Juergen Gross Signed-off-by: Paolo Bonzini Message-Id: <20210913135745.13944-3-jgross@suse.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f1b96a2ebaa7..1f9e80ce4723 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -39,8 +39,8 @@ #include #include -#ifndef KVM_MAX_VCPU_ID -#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS +#ifndef KVM_MAX_VCPU_IDS +#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS #endif /* -- cgit v1.2.3 From 515a0c79e7963cc4556ca61516cc09d39e592712 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Fri, 27 Aug 2021 16:00:03 +0800 Subject: kvm: irqfd: avoid update unmodified entries of the routing All of the irqfds would be updated when updating the irq routing; this is too expensive if there are too many irqfds. However, we can reduce the cost by avoiding some unnecessary updates: for irqs of MSI type on x86, the update can be skipped if the MSI values are not changed. The vfio migration could receive benefit from this optimization. The test VM has 128 vcpus and 8 VFs (with 65 vectors enabled), so the VM has more than 520 irqfds. We measure the cost of vfio_msix_enable() (in QEMU, it would set routing for each irqfd) for each VF, and we can see the total cost can be significantly reduced.

          Origin   Apply this Patch
   1st        8            4
   2nd       15            5
   3rd       22            6
   4th       24            6
   5th       36            7
   6th       44            7
   7th       51            8
   8th       58            8
   Total  258ms         51ms

We're also trying to optimize the QEMU part [1], but it's still worth optimizing the KVM side to gain more benefits. [1] https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04215.html Signed-off-by: Longpeng(Mike) Message-Id: <20210827080003.2689-1-longpeng2@huawei.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1f9e80ce4723..3f87d6ad20bf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1764,6 +1764,8 @@ void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *); void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *); int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *, + struct kvm_kernel_irq_routing_entry *); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS -- cgit v1.2.3 From bcf9033e5449bdcaa9bed46467a7141a8049dadb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 14 Sep 2021 14:10:33 +0200 Subject: sched: move CPU field back into thread_info if THREAD_INFO_IN_TASK=y THREAD_INFO_IN_TASK moved the CPU field out of thread_info, but this causes some issues on architectures that define raw_smp_processor_id() in terms of this field, due to the fact that #include'ing linux/sched.h to get at struct task_struct is problematic in terms of circular dependencies. Given that thread_info and task_struct are the same data structure anyway when THREAD_INFO_IN_TASK=y, let's move it back so that having access to the type definition of struct thread_info is sufficient to reference the CPU number of the current task.
Note that this requires THREAD_INFO_IN_TASK's definition of the task_thread_info() helper to be updated, as task_cpu() takes a pointer-to-const, whereas task_thread_info() (which is used to generate lvalues as well), needs a non-const pointer. So make it a macro instead. Signed-off-by: Ard Biesheuvel Acked-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Michael Ellerman --- include/linux/sched.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 39039ce8ac4c..8699594e3f99 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -750,10 +750,6 @@ struct task_struct { #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; -#ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -#endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -1886,10 +1882,7 @@ extern struct thread_info init_thread_info; extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK -static inline struct thread_info *task_thread_info(struct task_struct *task) -{ - return &task->thread_info; -} +# define task_thread_info(task) (&(task)->thread_info) #elif !defined(__HAVE_THREAD_FUNCTIONS) # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif @@ -2114,11 +2107,7 @@ static __always_inline bool need_resched(void) static inline unsigned int task_cpu(const struct task_struct *p) { -#ifdef CONFIG_THREAD_INFO_IN_TASK - return READ_ONCE(p->cpu); -#else return READ_ONCE(task_thread_info(p)->cpu); -#endif } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); -- cgit v1.2.3 From 38a68934aa72459217986cf6b461d87f7602648a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 24 Sep 2021 17:56:51 +0200 Subject: vfio: Move vfio_iommu_group_get() to vfio_register_group_dev() We don't need to hold a reference to the group in the driver as well as obtain a reference to the same group as the first thing vfio_register_group_dev() does. Since the drivers never use the group move this all into the core code. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-2-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/vfio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b53a9557884a..f7083c2fd0d0 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -71,9 +71,6 @@ struct vfio_device_ops { int (*match)(struct vfio_device *vdev, char *buf); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); -- cgit v1.2.3 From c68ea0d00ad82428154aed890ec9f793e460fa1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:57 +0200 Subject: vfio: simplify iommu group allocation for mediated devices Reuse the logic in vfio_noiommu_group_alloc to allocate a fake single-device iommu group for mediated devices by factoring out a common function, and replacing the noiommu boolean field in struct vfio_group with an enum to distinguish the three different kinds of groups. 
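A sketch of the resulting enum (it lives in drivers/vfio/vfio.c rather than in this header; the comments are illustrative):

    /* sketch: the three kinds of groups the commit distinguishes */
    enum vfio_group_type {
        VFIO_IOMMU,             /* device backed by a real IOMMU group */
        VFIO_EMULATED_IOMMU,    /* mdev-style, vendor-defined isolation */
        VFIO_NO_IOMMU,          /* unsafe noiommu mode */
    };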
Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-8-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/vfio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f7083c2fd0d0..bbe293008626 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -75,6 +75,7 @@ void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); +int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); -- cgit v1.2.3 From 67462037872d5ca57dc4674cccff191947b9b43e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:58 +0200 Subject: vfio: remove unused method from vfio_iommu_driver_ops The read, write and mmap methods are never implemented, so remove them. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-9-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/vfio.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index bbe293008626..7a57a0077f96 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -95,13 +95,8 @@ struct vfio_iommu_driver_ops { struct module *owner; void *(*open)(unsigned long arg); void (*release)(void *iommu_data); - ssize_t (*read)(void *iommu_data, char __user *buf, - size_t count, loff_t *ppos); - ssize_t (*write)(void *iommu_data, const char __user *buf, - size_t count, loff_t *size); long (*ioctl)(void *iommu_data, unsigned int cmd, unsigned long arg); - int (*mmap)(void *iommu_data, struct vm_area_struct *vma); int (*attach_group)(void *iommu_data, struct iommu_group *group); void (*detach_group)(void *iommu_data, -- cgit v1.2.3 From 8cc02d22d7e1596ed687c4ff967c32056c2bef3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:59 +0200 Subject: vfio: move the vfio_iommu_driver_ops interface out of <linux/vfio.h> Create a new private drivers/vfio/vfio.h header for the interface between the VFIO core and the iommu drivers.
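A sketch of what the new private header ends up carrying (mirroring the declarations removed from include/linux/vfio.h in the hunk below):

    /* drivers/vfio/vfio.h (private to the VFIO core) -- sketch */
    enum vfio_iommu_notify_type {
        VFIO_IOMMU_CONTAINER_CLOSE = 0,
    };

    struct vfio_iommu_driver_ops;   /* same ops table as removed below */

    int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
    void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops);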
Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-10-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/vfio.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 7a57a0077f96..76191d7abed1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -82,50 +82,6 @@ extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); -/* events for the backend driver notify callback */ -enum vfio_iommu_notify_type { - VFIO_IOMMU_CONTAINER_CLOSE = 0, -}; - -/** - * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks - */ -struct vfio_iommu_driver_ops { - char *name; - struct module *owner; - void *(*open)(unsigned long arg); - void (*release)(void *iommu_data); - long (*ioctl)(void *iommu_data, unsigned int cmd, - unsigned long arg); - int (*attach_group)(void *iommu_data, - struct iommu_group *group); - void (*detach_group)(void *iommu_data, - struct iommu_group *group); - int (*pin_pages)(void *iommu_data, - struct iommu_group *group, - unsigned long *user_pfn, - int npage, int prot, - unsigned long *phys_pfn); - int (*unpin_pages)(void *iommu_data, - unsigned long *user_pfn, int npage); - int (*register_notifier)(void *iommu_data, - unsigned long *events, - struct notifier_block *nb); - int (*unregister_notifier)(void *iommu_data, - struct notifier_block *nb); - int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, - void *data, size_t count, bool write); - struct iommu_domain *(*group_iommu_domain)(void *iommu_data, - struct iommu_group *group); - void (*notify)(void *iommu_data, - enum vfio_iommu_notify_type event); -}; - -extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); - -extern void vfio_unregister_iommu_driver( - const struct vfio_iommu_driver_ops *ops); - /* * External user API */ -- cgit v1.2.3 From fda49d97f2c4faf0b42e8796d3e6c868d992f3af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:00 +0200 Subject: vfio: remove the unused mdev iommu hook The iommu_device field in struct mdev_device has never been used since it was added more than 2 years ago. This is a manual revert of commit 7bd50f0cd2 ("vfio/type1: Add domain at(de)taching group helpers"). Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-11-hch@lst.de Signed-off-by: Alex Williamson --- include/linux/mdev.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 68427e8fadeb..15d03f6532d0 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -18,7 +18,6 @@ struct mdev_device { void *driver_data; struct list_head next; struct mdev_type *type; - struct device *iommu_device; bool active; }; @@ -27,25 +26,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* - * Called by the parent device driver to set the device which represents - * this mdev in iommu protection scope. By default, the iommu device is - * NULL, that indicates using vendor defined isolation. - * - * @dev: the mediated device that iommu will isolate. - * @iommu_device: a pci device which represents the iommu for @dev. 
- */ -static inline void mdev_set_iommu_device(struct mdev_device *mdev, - struct device *iommu_device) -{ - mdev->iommu_device = iommu_device; -} - -static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) -{ - return mdev->iommu_device; -} - unsigned int mdev_get_type_group_id(struct mdev_device *mdev); unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); -- cgit v1.2.3 From 02afb8d6048d6526619e6e2dcdc95ce9c2bdb52f Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:57 +0900 Subject: kprobe: Simplify prepare_kprobe() by dropping redundant version The function prepare_kprobe() is called during kprobe registration and is responsible for ensuring any architecture related preparation for the kprobe is done before returning. One of two versions of prepare_kprobe() is chosen depending on the availability of KPROBE_ON_FTRACE in the kernel configuration. Simplify the code by dropping the version when KPROBE_ON_FTRACE is not selected - instead relying on kprobe_ftrace() to return false when KPROBE_ON_FTRACE is not set. No functional change. Link: https://lkml.kernel.org/r/163163033696.489837.9264661820279300788.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e4f3bfe08757..0b75549b2815 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -354,6 +354,11 @@ static inline void wait_for_kprobe_optimizer(void) { } extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +#else +static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + return -EINVAL; +} #endif int arch_check_ftrace_location(struct kprobe *p); -- cgit v1.2.3 From 4402deae8993fb0e25a19bb999b38df13e25a7e0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:39:16 +0900 Subject: kprobes: Make arch_check_ftrace_location static arch_check_ftrace_location() was introduced as a weak function in commit f7f242ff004499 ("kprobes: introduce weak arch_check_ftrace_location() helper function") to allow architectures to handle kprobes call site on their own. Recently, the only architecture (csky) to implement arch_check_ftrace_location() was migrated to using the common version. As a result, further cleanup the code to drop the weak attribute and rename the function to remove the architecture specific implementation. 
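For reference, a sketch of the simplified helper this results in (kernel/kprobes.c):

    static int prepare_kprobe(struct kprobe *p)
    {
        /* kprobe_ftrace() can only be true when KPROBE_ON_FTRACE is enabled,
         * so the -EINVAL stub above is effectively unreachable otherwise. */
        if (kprobe_ftrace(p))
            return arch_prepare_kprobe_ftrace(p);

        return arch_prepare_kprobe(p);
    }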
Link: https://lkml.kernel.org/r/163163035673.489837.2367816318195254104.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0b75549b2815..8a9412bb0d5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -361,8 +361,6 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) } #endif -int arch_check_ftrace_location(struct kprobe *p); - /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); -- cgit v1.2.3 From 223a76b268c9cfa265d454879ae09e2c9c808f87 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:34 +0900 Subject: kprobes: Fix coding style issues Fix coding style issues reported by checkpatch.pl and update comments to quote variable names and add "()" to function name. One TODO comment in __disarm_kprobe() is removed because it has been done by following commit. Link: https://lkml.kernel.org/r/163163037468.489837.4282347782492003960.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8a9412bb0d5e..756d3d23ce37 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -3,7 +3,6 @@ #define _LINUX_KPROBES_H /* * Kernel Probes (KProbes) - * include/linux/kprobes.h * * Copyright (C) IBM Corporation, 2002, 2004 * @@ -39,7 +38,7 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 -#else /* CONFIG_KPROBES */ +#else /* !CONFIG_KPROBES */ #include typedef int kprobe_opcode_t; struct arch_specific_insn { @@ -228,7 +227,7 @@ static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance return READ_ONCE(ri->rph->rp); } -#else /* CONFIG_KRETPROBES */ +#else /* !CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { @@ -239,11 +238,15 @@ static inline int arch_trampoline_kprobe(struct kprobe *p) } #endif /* CONFIG_KRETPROBES */ +/* Markers of '_kprobe_blacklist' section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + extern struct kretprobe_blackpoint kretprobe_blacklist[]; #ifdef CONFIG_KPROBES_SANITY_TEST extern int init_test_probes(void); -#else +#else /* !CONFIG_KPROBES_SANITY_TEST */ static inline int init_test_probes(void) { return 0; @@ -303,7 +306,7 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ #define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, unsigned long *value, char *type, char *sym); -#else /* __ARCH_WANT_KPROBES_INSN_SLOT */ +#else /* !__ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ @@ -345,11 +348,12 @@ extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); -#endif +#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); -#else +#else /* !CONFIG_OPTPROBES */ static inline void 
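A sketch of the resulting static helper after the rename (the upstream version uses #ifdef CONFIG_KPROBES_ON_FTRACE; IS_ENABLED() is used here for brevity):

    static int check_ftrace_location(struct kprobe *p)
    {
        unsigned long ftrace_addr = ftrace_location((unsigned long)p->addr);

        if (!ftrace_addr)
            return 0;
        if (!IS_ENABLED(CONFIG_KPROBES_ON_FTRACE))
            return -EINVAL;
        /* the probe must sit exactly on the ftrace call site */
        if ((unsigned long)p->addr != ftrace_addr)
            return -EILSEQ;
        p->flags |= KPROBE_FLAG_FTRACE;
        return 0;
    }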
wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ + #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); @@ -359,7 +363,7 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; } -#endif +#endif /* CONFIG_KPROBES_ON_FTRACE */ /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); @@ -367,7 +371,7 @@ struct kprobe *get_kprobe(void *addr); /* kprobe_running() will just return the current_kprobe on this CPU */ static inline struct kprobe *kprobe_running(void) { - return (__this_cpu_read(current_kprobe)); + return __this_cpu_read(current_kprobe); } static inline void reset_current_kprobe(void) @@ -431,11 +435,11 @@ static inline struct kprobe *kprobe_running(void) } static inline int register_kprobe(struct kprobe *p) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int register_kprobes(struct kprobe **kps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline void unregister_kprobe(struct kprobe *p) { @@ -445,11 +449,11 @@ static inline void unregister_kprobes(struct kprobe **kps, int num) } static inline int register_kretprobe(struct kretprobe *rp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int register_kretprobes(struct kretprobe **rps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline void unregister_kretprobe(struct kretprobe *rp) { @@ -465,11 +469,11 @@ static inline void kprobe_free_init_mem(void) } static inline int disable_kprobe(struct kprobe *kp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int enable_kprobe(struct kprobe *kp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline bool within_kprobe_blacklist(unsigned long addr) @@ -482,6 +486,7 @@ static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, return -ERANGE; } #endif /* CONFIG_KPROBES */ + static inline int disable_kretprobe(struct kretprobe *rp) { return disable_kprobe(&rp->kp); @@ -496,13 +501,14 @@ static inline bool is_kprobe_insn_slot(unsigned long addr) { return false; } -#endif +#endif /* !CONFIG_KPROBES */ + #ifndef CONFIG_OPTPROBES static inline bool is_kprobe_optinsn_slot(unsigned long addr) { return false; } -#endif +#endif /* !CONFIG_OPTPROBES */ /* Returns true if kprobes handled the fault */ static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, -- cgit v1.2.3 From dfc05b55c3c6b15dbd889e9901ecb3fb695421bd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:46 +0900 Subject: kprobes: Use IS_ENABLED() instead of kprobes_built_in() Use IS_ENABLED(CONFIG_KPROBES) instead of kprobes_built_in(). This inline function is introduced only for avoiding #ifdef. But since now we have IS_ENABLED(), it is no longer needed. 
Link: https://lkml.kernel.org/r/163163038581.489837.2805250706507372658.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 756d3d23ce37..9c28fbb18e74 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -180,14 +180,6 @@ struct kprobe_blacklist_entry { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -/* - * For #ifdef avoidance: - */ -static inline int kprobes_built_in(void) -{ - return 1; -} - extern void kprobe_busy_begin(void); extern void kprobe_busy_end(void); @@ -417,10 +409,6 @@ int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym); #else /* !CONFIG_KPROBES: */ -static inline int kprobes_built_in(void) -{ - return 0; -} static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; @@ -514,7 +502,7 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, unsigned int trap) { - if (!kprobes_built_in()) + if (!IS_ENABLED(CONFIG_KPROBES)) return false; if (user_mode(regs)) return false; -- cgit v1.2.3 From c42421e205fc2570a4d019184ea7d6c382c93f4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:07 +0900 Subject: kprobes: treewide: Use 'kprobe_opcode_t *' for the code address in get_optimized_kprobe() Since get_optimized_kprobe() is only used inside kprobes, it doesn't need to use 'unsigned long' type for 'addr' parameter. Make it use 'kprobe_opcode_t *' for the 'addr' parameter and subsequent call of arch_within_optimized_kprobe() also should use 'kprobe_opcode_t *'. Note that MAX_OPTIMIZED_LENGTH and RELATIVEJUMP_SIZE are defined by byte-size, but the size of 'kprobe_opcode_t' depends on the architecture. Therefore, we must be careful when calculating addresses using those macros. Link: https://lkml.kernel.org/r/163163040680.489837.12133032364499833736.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9c28fbb18e74..6a5995f334a0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -329,7 +329,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr); + kprobe_opcode_t *addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); -- cgit v1.2.3 From 29e8077ae2beea6a85ad2d0bae9c550bd5d05ed9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:16 +0900 Subject: kprobes: Use bool type for functions which returns boolean value Use the 'bool' type instead of 'int' for the functions which returns a boolean value, because this makes clear that those functions don't return any error code. 
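The pattern works because IS_ENABLED() expands to a compile-time 0 or 1, so the dead branch is discarded entirely, as in the kprobe_page_fault() hunk above:

    if (!IS_ENABLED(CONFIG_KPROBES))    /* constant at compile time */
        return false;                   /* whole branch folds away when =y */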
Link: https://lkml.kernel.org/r/163163041649.489837.17311187321419747536.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 6a5995f334a0..0ba3f9e316d4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -104,25 +104,25 @@ struct kprobe { #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ /* Has this kprobe gone ? */ -static inline int kprobe_gone(struct kprobe *p) +static inline bool kprobe_gone(struct kprobe *p) { return p->flags & KPROBE_FLAG_GONE; } /* Is this kprobe disabled ? */ -static inline int kprobe_disabled(struct kprobe *p) +static inline bool kprobe_disabled(struct kprobe *p) { return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); } /* Is this kprobe really running optimized path ? */ -static inline int kprobe_optimized(struct kprobe *p) +static inline bool kprobe_optimized(struct kprobe *p) { return p->flags & KPROBE_FLAG_OPTIMIZED; } /* Is this kprobe uses ftrace ? */ -static inline int kprobe_ftrace(struct kprobe *p) +static inline bool kprobe_ftrace(struct kprobe *p) { return p->flags & KPROBE_FLAG_FTRACE; } -- cgit v1.2.3 From f2ec8d9a3b8c0f22cd6a2b4f5a2d9aee5206e3b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:36 +0900 Subject: kprobes: treewide: Replace arch_deref_entry_point() with dereference_symbol_descriptor() ~15 years ago kprobes grew the 'arch_deref_entry_point()' __weak function: 3d7e33825d87: ("jprobes: make jprobes a little safer for users") But this is just open-coded dereference_symbol_descriptor() in essence, and its obscure nature was causing bugs. Just use the real thing and remove arch_deref_entry_point(). Link: https://lkml.kernel.org/r/163163043630.489837.7924988885652708696.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0ba3f9e316d4..2ed61fcbc89c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -381,7 +381,6 @@ int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); void unregister_kprobes(struct kprobe **kps, int num); -unsigned long arch_deref_entry_point(void *); int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); -- cgit v1.2.3 From 96fed8ac2bb64ab45497fdd8e3d390165b7a9be8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:45 +0900 Subject: kprobes: treewide: Remove trampoline_address from kretprobe_trampoline_handler() The __kretprobe_trampoline_handler() callback, called from low level arch kprobes methods, has the 'trampoline_address' parameter, which is entirely superfluous as it basically just replicates: dereference_kernel_function_descriptor(kretprobe_trampoline) In fact we had bugs in arch code where it wasn't replicated correctly. So remove this superfluous parameter and use kretprobe_trampoline_addr() instead. 
Link: https://lkml.kernel.org/r/163163044546.489837.13505751885476015002.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 2ed61fcbc89c..96f5df93e36e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,15 +188,23 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); +void kretprobe_trampoline(void); +/* + * Since some architecture uses structured function pointer, + * use dereference_function_descriptor() to get real function address. + */ +static nokprobe_inline void *kretprobe_trampoline_addr(void) +{ + return dereference_kernel_function_descriptor(kretprobe_trampoline); +} + /* If the trampoline handler called from a kprobe, use this version */ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, - void *frame_pointer); + void *frame_pointer); static nokprobe_inline unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, - void *frame_pointer) + void *frame_pointer) { unsigned long ret; /* @@ -205,7 +213,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, * be running at this point. */ kprobe_busy_begin(); - ret = __kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer); + ret = __kretprobe_trampoline_handler(regs, frame_pointer); kprobe_busy_end(); return ret; -- cgit v1.2.3 From adf8a61a940c49fea6fab9c3865f2b69b8ceef28 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:54 +0900 Subject: kprobes: treewide: Make it harder to refer kretprobe_trampoline directly Since now there is kretprobe_trampoline_addr() for referring the address of kretprobe trampoline code, we don't need to access kretprobe_trampoline directly. Make it harder to refer by renaming it to __kretprobe_trampoline(). Link: https://lkml.kernel.org/r/163163045446.489837.14510577516938803097.stgit@devnote2 Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 96f5df93e36e..b6b2370f4a4c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,14 +188,14 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* * Since some architecture uses structured function pointer, * use dereference_function_descriptor() to get real function address. */ static nokprobe_inline void *kretprobe_trampoline_addr(void) { - return dereference_kernel_function_descriptor(kretprobe_trampoline); + return dereference_kernel_function_descriptor(__kretprobe_trampoline); } /* If the trampoline handler called from a kprobe, use this version */ -- cgit v1.2.3 From 03bac0df2886882c43e6d0bfff9dee84a184fc7e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:41:04 +0900 Subject: kprobes: Add kretprobe_find_ret_addr() for searching return address Introduce kretprobe_find_ret_addr() and is_kretprobe_trampoline(). 
These APIs will be used by the ORC stack unwinder and ftrace, so that they can check whether the given address points to the kretprobe trampoline code and query the correct return address in that case.

Link: https://lkml.kernel.org/r/163163046461.489837.1044778356430293962.stgit@devnote2
Signed-off-by: Masami Hiramatsu
Tested-by: Andrii Nakryiko
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/kprobes.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index b6b2370f4a4c..6d47a9da1e0a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -505,6 +505,28 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr)
 }
 #endif /* !CONFIG_OPTPROBES */

+#ifdef CONFIG_KRETPROBES
+static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
+{
+	return (void *)addr == kretprobe_trampoline_addr();
+}
+
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+				      struct llist_node **cur);
+#else
+static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr)
+{
+	return false;
+}
+
+static nokprobe_inline
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+				      struct llist_node **cur)
+{
+	return 0;
+}
+#endif
+
 /* Returns true if kprobes handled the fault */
 static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs,
 					      unsigned int trap)
-- cgit v1.2.3


From e028c4f7ac7ca8c96126fe46c54ab3d56ffe6a66 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Tue, 14 Sep 2021 23:41:13 +0900
Subject: objtool: Add frame-pointer-specific function ignore

Add a CONFIG_FRAME_POINTER-specific version of STACK_FRAME_NON_STANDARD() for the case where a function is intentionally missing frame pointer setup, but otherwise needs objtool/ORC coverage when frame pointers are disabled.

Link: https://lkml.kernel.org/r/163163047364.489837.17377799909553689661.stgit@devnote2
Signed-off-by: Josh Poimboeuf
Reviewed-by: Masami Hiramatsu
Tested-by: Masami Hiramatsu
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/objtool.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 7e72d975cb76..aca52db2f3f3 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -66,6 +66,17 @@ struct unwind_hint {
 	static void __used __section(".discard.func_stack_frame_non_standard") \
 	*__func_stack_frame_non_standard_##func = func

+/*
+ * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore
+ * for the case where a function is intentionally missing frame pointer setup,
+ * but otherwise needs objtool/ORC coverage when frame pointers are disabled.
+ */
+#ifdef CONFIG_FRAME_POINTER
+#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func)
+#else
+#define STACK_FRAME_NON_STANDARD_FP(func)
+#endif
+
 #else /* __ASSEMBLY__ */

 /*
@@ -127,6 +138,7 @@ struct unwind_hint {
 #define UNWIND_HINT(sp_reg, sp_offset, type, end)	\
 	"\n\t"
 #define STACK_FRAME_NON_STANDARD(func)
+#define STACK_FRAME_NON_STANDARD_FP(func)
 #else
 #define ANNOTATE_INTRA_FUNCTION_CALL
 .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0
-- cgit v1.2.3


From bf094cffea2a6503ce84062f9f0243bef77c58f9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 14 Sep 2021 23:42:51 +0900
Subject: x86/kprobes: Fixup return address in generic trampoline handler

In x86, the fake return address on the stack saved by __kretprobe_trampoline() will be replaced with the real return address after returning from trampoline_handler(). Before fixing the return address, the real return address can be found in 'current->kretprobe_instances'.

However, since there is a window between updating 'current->kretprobe_instances' and fixing the address on the stack, if an interrupt happens inside that window and the interrupt handler does a stacktrace, it may fail to unwind because it cannot get the correct return address from 'current->kretprobe_instances'.

Eliminate that window by fixing the return address right before updating 'current->kretprobe_instances'.

Link: https://lkml.kernel.org/r/163163057094.489837.9044470370440745866.stgit@devnote2
Signed-off-by: Masami Hiramatsu
Tested-by: Andrii Nakryiko
Signed-off-by: Steven Rostedt (VMware)
---
 include/linux/kprobes.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6d47a9da1e0a..e974caf39d3e 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -188,6 +188,9 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);

+void arch_kretprobe_fixup_return(struct pt_regs *regs,
+				 kprobe_opcode_t *correct_ret_addr);
+
 void __kretprobe_trampoline(void);
 /*
  * Since some architecture uses structured function pointer,
-- cgit v1.2.3


From 78b497f2e62d8c7514de5f83c80837bbb120e93e Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Fri, 3 Sep 2021 15:08:05 +0200
Subject: kvm: use kvfree() in kvm_arch_free_vm()

By switching from kfree() to kvfree() in kvm_arch_free_vm(), Arm64 can use the common variant. This can be accomplished by adding another macro, __KVM_HAVE_ARCH_VM_FREE, which will be used only by x86 for now.

Further simplification can be achieved by adding __kvm_arch_free_vm(), which does the common part.
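As an illustrative sketch (the exact x86 cleanup is an assumption here, not part of this hunk), an architecture defining __KVM_HAVE_ARCH_VM_FREE can then layer its own teardown on top of the common part:

    void kvm_arch_free_vm(struct kvm *kvm)
    {
	    /* arch-specific cleanup would go here */
	    __kvm_arch_free_vm(kvm);	/* common kvfree() of the VM */
    }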
Suggested-by: Paolo Bonzini
Signed-off-by: Juergen Gross
Message-Id: <20210903130808.30142-5-jgross@suse.com>
Signed-off-by: Paolo Bonzini
---
 include/linux/kvm_host.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3f87d6ad20bf..60a35d9fe259 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1081,10 +1081,17 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
 {
 	return kzalloc(sizeof(struct kvm), GFP_KERNEL);
 }
+#endif
+
+static inline void __kvm_arch_free_vm(struct kvm *kvm)
+{
+	kvfree(kvm);
+}

+#ifndef __KVM_HAVE_ARCH_VM_FREE
 static inline void kvm_arch_free_vm(struct kvm *kvm)
 {
-	kfree(kvm);
+	__kvm_arch_free_vm(kvm);
 }
 #endif
-- cgit v1.2.3


From 874f670e6088d3bff3972ecd44c1cb00610f9183 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 23 Sep 2021 18:54:35 +0200
Subject: sched: Clean up the might_sleep() underscore zoo

__might_sleep() vs. ___might_sleep() is hard to distinguish. Aside from that, the three-underscore variant is exposed to provide a checkpoint for rescheduling points, which are distinct from blocking points. They are semantically a preemption point, which means that scheduling is state preserving; a real blocking operation, e.g. mutex_lock() or wait*(), cannot preserve a task state which is not equal to RUNNING.

While blocking on a "sleeping" spinlock in RT-enabled kernels technically falls into the voluntary scheduling category, because the task has to wait until the contended spin/rw lock becomes available, it can semantically be mapped to a voluntary preemption: the RT lock substitution code and the scheduler provide mechanisms to preserve the task state and to take regular non-lock-related wakeups into account.

Rename ___might_sleep() to __might_resched() to make the distinction between these functions clear.
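After the rename, the two checkpoints read distinctly at the call sites (illustrative, using the signatures as they stand at this point in the series):

    /* blocking point: the task must be in TASK_RUNNING state */
    __might_sleep(__FILE__, __LINE__, 0);

    /* preemption point: state-preserving, e.g. behind cond_resched() */
    __might_resched(__FILE__, __LINE__, 0);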
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165357.928693482@linutronix.de --- include/linux/kernel.h | 6 +++--- include/linux/sched.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2776423a587e..5e4ae54da73e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -111,7 +111,7 @@ static __always_inline void might_resched(void) #endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -extern void ___might_sleep(const char *file, int line, int preempt_offset); +extern void __might_resched(const char *file, int line, int preempt_offset); extern void __might_sleep(const char *file, int line, int preempt_offset); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); @@ -168,8 +168,8 @@ extern void __cant_migrate(const char *file, int line); */ # define non_block_end() WARN_ON(current->non_block_count-- == 0) #else - static inline void ___might_sleep(const char *file, int line, - int preempt_offset) { } + static inline void __might_resched(const char *file, int line, + int preempt_offset) { } static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index e12b524426b0..b38f002334d5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2038,7 +2038,7 @@ static inline int _cond_resched(void) { return 0; } #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ - ___might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2046,9 +2046,9 @@ extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); -#define cond_resched_lock(lock) ({ \ - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ - __cond_resched_lock(lock); \ +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ -- cgit v1.2.3 From 7b5ff4bb9adc53cfbf7ac9ba7820ccf0cd7c070a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:54:37 +0200 Subject: sched: Make cond_resched_*lock() variants consistent vs. might_sleep() Commit 3427445afd26 ("sched: Exclude cond_resched() from nested sleep test") removed the task state check of __might_sleep() for cond_resched_lock() because cond_resched_lock() is not a voluntary scheduling point which blocks. It's a preemption point which requires the lock holder to release the spin lock. The same rationale applies to cond_resched_rwlock_read/write(), but those were not touched. Make it consistent and use the non-state checking __might_resched() there as well. 
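The difference matters for patterns like the following (illustrative only):

    set_current_state(TASK_INTERRUPTIBLE);
    ...
    cond_resched_rwlock_read(&lock);

No blocking happens here, the lock is merely dropped and reacquired, so the non-RUNNING task state is legitimate; __might_resched() skips the task state check that __might_sleep() would have flagged as a nested sleep.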
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165357.991262778@linutronix.de --- include/linux/sched.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b38f002334d5..7a989f2487f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,14 +2051,14 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_lock(lock); \ }) -#define cond_resched_rwlock_read(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_read(lock); \ +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_read(lock); \ }) -#define cond_resched_rwlock_write(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_write(lock); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) -- cgit v1.2.3 From 42a387566c567603bafa1ec0c5b71c35cba83e86 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:54:38 +0200 Subject: sched: Remove preempt_offset argument from __might_sleep() All callers hand in 0 and never will hand in anything else. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165358.054321586@linutronix.de --- include/linux/kernel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5e4ae54da73e..f95ee786e4ef 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -112,7 +112,7 @@ static __always_inline void might_resched(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP extern void __might_resched(const char *file, int line, int preempt_offset); -extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __might_sleep(const char *file, int line); extern void __cant_sleep(const char *file, int line, int preempt_offset); extern void __cant_migrate(const char *file, int line); @@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) /** * cant_sleep - annotation for functions that cannot sleep * @@ -170,8 +170,7 @@ extern void __cant_migrate(const char *file, int line); #else static inline void __might_resched(const char *file, int line, int preempt_offset) { } - static inline void __might_sleep(const char *file, int line, - int preempt_offset) { } +static inline void __might_sleep(const char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) # define cant_migrate() do { } while (0) -- cgit v1.2.3 From 50e081b96e35e43b65591f40f7376204decd1cb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:54:43 +0200 Subject: sched: Make RCU nest depth distinct in __might_resched() For !RT kernels RCU nest depth in __might_resched() is always expected to be 0, but on RT kernels it can be non zero while the preempt count is expected to be always 0. 
Instead of playing magic games in interpreting the 'preempt_offset' argument, rename it to 'offsets', use the lower 8 bits for the expected preempt count, allow the expected RCU nest depth to be handed in via the upper bits, and adapt the __might_resched() code and related checks and printks.

The affected call sites are updated in subsequent steps.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20210923165358.243232823@linutronix.de
---
 include/linux/kernel.h | 4 ++--
 include/linux/sched.h  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f95ee786e4ef..e8696e4a45aa 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -111,7 +111,7 @@ static __always_inline void might_resched(void)
 #endif /* CONFIG_PREEMPT_* */

 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-extern void __might_resched(const char *file, int line, int preempt_offset);
+extern void __might_resched(const char *file, int line, unsigned int offsets);
 extern void __might_sleep(const char *file, int line);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_migrate(const char *file, int line);
@@ -169,7 +169,7 @@ extern void __cant_migrate(const char *file, int line);
 # define non_block_end() WARN_ON(current->non_block_count-- == 0)
 #else
   static inline void __might_resched(const char *file, int line,
-				     int preempt_offset) { }
+				     unsigned int offsets) { }
 static inline void __might_sleep(const char *file, int line) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7a989f2487f8..b448c7460577 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2046,6 +2046,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
 extern int __cond_resched_rwlock_read(rwlock_t *lock);
 extern int __cond_resched_rwlock_write(rwlock_t *lock);

+#define MIGHT_RESCHED_RCU_SHIFT		8
+#define MIGHT_RESCHED_PREEMPT_MASK	((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)
+
 #define cond_resched_lock(lock) ({					\
	__might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	\
	__cond_resched_lock(lock);					\
-- cgit v1.2.3


From 3e9cc688e56cc2abb9b6067f57c8397f6c96d42c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 23 Sep 2021 18:54:44 +0200
Subject: sched: Make cond_resched_lock() variants RT aware

The __might_resched() checks in the cond_resched_lock() variants use PREEMPT_LOCK_OFFSET for preempt count offset checking, which takes into account the preempt count raised by the spin_lock() that is still held at that point.

On PREEMPT_RT enabled kernels spin/rw_lock held sections stay preemptible, which means PREEMPT_LOCK_OFFSET is 0, but that still triggers the __might_resched() check because that takes RCU read side nesting into account: on RT enabled kernels spin/read/write_lock() issue rcu_read_lock() to resemble the !RT semantics, which means in cond_resched_lock() the might-resched check will see preempt_count() == 0 and rcu_preempt_depth() == 1.

Introduce PREEMPT_LOCK_RESCHED_OFFSETS for those might-resched checks and map it depending on CONFIG_PREEMPT_RT.
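The 'offsets' word then carries both expected values, roughly (sketch based on the macros introduced in the previous patch; the unpacking itself lives in the __might_resched() implementation, outside this include/linux view):

    /* !RT: the held lock raises the preempt count, no RCU nesting */
    offsets = PREEMPT_LOCK_OFFSET;
    /* RT: locks stay preemptible but imply rcu_read_lock() */
    offsets = PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT);

    /* __might_resched() is then expected to unpack: */
    expected_preempt_count = offsets & MIGHT_RESCHED_PREEMPT_MASK;
    expected_rcu_depth     = offsets >> MIGHT_RESCHED_RCU_SHIFT;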
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165358.305969211@linutronix.de --- include/linux/preempt.h | 5 +++-- include/linux/sched.h | 34 +++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..031898b38d06 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -122,9 +122,10 @@ * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) -#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET +#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else -#define PREEMPT_LOCK_OFFSET 0 +/* Locks on RT do not disable preemption */ +#define PREEMPT_LOCK_OFFSET 0 #endif /* diff --git a/include/linux/sched.h b/include/linux/sched.h index b448c7460577..21b7cd00bf1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2049,19 +2049,35 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) -#define cond_resched_lock(lock) ({ \ - __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_lock(lock); \ +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET +#else +/* + * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) +#endif + +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ }) -#define cond_resched_rwlock_read(lock) ({ \ - __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_read(lock); \ +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ }) -#define cond_resched_rwlock_write(lock) ({ \ - __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_rwlock_write(lock); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) -- cgit v1.2.3 From f792565326825ed806626da50c6f9a928f1079c1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 29 Sep 2021 12:43:13 -0700 Subject: perf/core: fix userpage->time_enabled of inactive events Users of rdpmc rely on the mmapped user page to calculate accurate time_enabled. Currently, userpage->time_enabled is only updated when the event is added to the pmu. As a result, inactive event (due to counter multiplexing) does not have accurate userpage->time_enabled. 
This can be reproduced with something like: /* open 20 task perf_event "cycles", to create multiplexing */ fd = perf_event_open(); /* open task perf_event "cycles" */ userpage = mmap(fd); /* use mmap and rdmpc */ while (true) { time_enabled_mmap = xxx; /* use logic in perf_event_mmap_page */ time_enabled_read = read(fd).time_enabled; if (time_enabled_mmap > time_enabled_read) BUG(); } Fix this by updating userpage for inactive events in merge_sched_in. Suggested-by: Peter Zijlstra (Intel) Reported-and-tested-by: Lucian Grijincu Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210929194313.2398474-1-songliubraving@fb.com --- include/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fe156a8170aa..9b60bb89d86a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -683,7 +683,9 @@ struct perf_event { /* * timestamp shadows the actual context timing but it can * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in. + * context time as it was when the event was last scheduled in, + * or when ctx_sched_in failed to schedule the event because we + * run out of PMC. * * ctx_time already accounts for ctx->timestamp. Therefore to * compute ctx_time for a sample, simply add perf_clock(). -- cgit v1.2.3 From 83d40a61046f73103b4e5d8f1310261487ff63b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Sep 2021 15:31:11 +0200 Subject: sched: Always inline is_percpu_thread() vmlinux.o: warning: objtool: check_preemption_disabled()+0x81: call to is_percpu_thread() leaves .noinstr.text section Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928084218.063371959@infradead.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 39039ce8ac4c..c1a927ddec64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1720,7 +1720,7 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -static inline bool is_percpu_thread(void) +static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && -- cgit v1.2.3 From 74e78adc6ccf6c3b53939788cf0c49f54db70731 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Fri, 24 Sep 2021 15:37:08 +0530 Subject: firmware: xilinx: Add OSPI Mux selection support Add OSPI Mux selection API support to select the AXI interface to OSPI. 
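A controller driver would then pick the AXI interface before issuing transfers, along the lines of (hypothetical caller; 'dev_id' stands in for the OSPI device node id):

    ret = zynqmp_pm_ospi_mux_select(dev_id, PM_OSPI_MUX_SEL_DMA);
    if (ret)
	    return ret;
    /* ... DMA transfers; use PM_OSPI_MUX_SEL_LINEAR for linear reads ... */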
Signed-off-by: Sai Krishna Potthuri Link: https://lore.kernel.org/r/1632478031-12242-2-git-send-email-lakshmi.sai.krishna.potthuri@xilinx.com Signed-off-by: Mark Brown --- include/linux/firmware/xlnx-zynqmp.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 56b426fe020c..4c70a6e2141e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -123,6 +123,7 @@ enum pm_ioctl_id { IOCTL_READ_PGGS = 15, /* Set healthy bit value */ IOCTL_SET_BOOT_HEALTH_STATUS = 17, + IOCTL_OSPI_MUX_SELECT = 21, }; enum pm_query_id { @@ -351,6 +352,11 @@ enum zynqmp_pm_shutdown_subtype { ZYNQMP_PM_SHUTDOWN_SUBTYPE_SYSTEM = 2, }; +enum ospi_mux_select_type { + PM_OSPI_MUX_SEL_DMA = 0, + PM_OSPI_MUX_SEL_LINEAR = 1, +}; + /** * struct zynqmp_pm_query_data - PM query data * @qid: query ID @@ -387,6 +393,7 @@ int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data); int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data); int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value); int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type); +int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select); int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag); int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status); @@ -508,6 +515,11 @@ static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) return -ENODEV; } +static inline int zynqmp_pm_ospi_mux_select(u32 dev_id, u32 select) +{ + return -ENODEV; +} + static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag) { -- cgit v1.2.3 From 42f355ef59a2f98fa4affb4265d3ba3e2d86baf1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 19 May 2021 16:00:20 -0400 Subject: audit: replace magic audit syscall class numbers with macros Replace audit syscall class magic numbers with macros. This required putting the macros into new header file include/linux/audit_arch.h since the syscall macros were included for both 64 bit and 32 bit in any compat code, causing redefinition warnings. Link: https://lore.kernel.org/r/2300b1083a32aade7ae7efb95826e8f3f260b1df.1621363275.git.rgb@redhat.com Signed-off-by: Richard Guy Briggs Acked-by: Christian Brauner [PM: renamed header to audit_arch.h after consulting with Richard] Signed-off-by: Paul Moore --- include/linux/audit.h | 1 + include/linux/audit_arch.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 include/linux/audit_arch.h (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 82b7c1116a85..5fbeeeb6b726 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/include/linux/audit_arch.h b/include/linux/audit_arch.h new file mode 100644 index 000000000000..d4a506faabb0 --- /dev/null +++ b/include/linux/audit_arch.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* audit_arch.h -- Arch layer specific support for audit + * + * Copyright 2021 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. 
+ * + * Author: Richard Guy Briggs + */ +#ifndef _LINUX_AUDIT_ARCH_H_ +#define _LINUX_AUDIT_ARCH_H_ + +enum auditsc_class_t { + AUDITSC_NATIVE = 0, + AUDITSC_COMPAT, + AUDITSC_OPEN, + AUDITSC_OPENAT, + AUDITSC_SOCKETCALL, + AUDITSC_EXECVE, + + AUDITSC_NVALS /* count */ +}; + +#endif -- cgit v1.2.3 From 1c30e3af8a79260cdba833a719209b01e6b92300 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 19 May 2021 16:00:21 -0400 Subject: audit: add support for the openat2 syscall The openat2(2) syscall was added in kernel v5.6 with commit fddb5d430ad9 ("open: introduce openat2(2) syscall"). Add the openat2(2) syscall to the audit syscall classifier. Link: https://github.com/linux-audit/audit-kernel/issues/67 Link: https://lore.kernel.org/r/f5f1a4d8699613f8c02ce762807228c841c2e26f.1621363275.git.rgb@redhat.com Signed-off-by: Richard Guy Briggs Acked-by: Christian Brauner [PM: merge fuzz due to previous header rename, commit line wraps] Signed-off-by: Paul Moore --- include/linux/audit_arch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/audit_arch.h b/include/linux/audit_arch.h index d4a506faabb0..8fdb1afe251a 100644 --- a/include/linux/audit_arch.h +++ b/include/linux/audit_arch.h @@ -16,6 +16,7 @@ enum auditsc_class_t { AUDITSC_OPENAT, AUDITSC_SOCKETCALL, AUDITSC_EXECVE, + AUDITSC_OPENAT2, AUDITSC_NVALS /* count */ }; -- cgit v1.2.3 From e8c0722927e8fc112d78cc0435d336fcd7e3507c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 1 Oct 2021 18:15:27 +0300 Subject: net: mscc: ocelot: write full VLAN TCI in the injection header The VLAN TCI contains more than the VLAN ID, it also has the VLAN PCP and Drop Eligibility Indicator. If the ocelot driver is going to write the VLAN header inside the DSA tag, it could just as well write the entire TCI. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 435777a0073c..0fe101e8e190 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -210,9 +210,9 @@ static inline void ocelot_ifh_set_tag_type(void *injection, u64 tag_type) packing(injection, &tag_type, 16, 16, OCELOT_TAG_LEN, PACK, 0); } -static inline void ocelot_ifh_set_vid(void *injection, u64 vid) +static inline void ocelot_ifh_set_vlan_tci(void *injection, u64 vlan_tci) { - packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); + packing(injection, &vlan_tci, 15, 0, OCELOT_TAG_LEN, PACK, 0); } #endif -- cgit v1.2.3 From f3956ebb3bf06ab2266ad5ee2214aed46405810c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Oct 2021 14:32:23 -0700 Subject: ethernet: use eth_hw_addr_set() instead of ether_addr_copy() Convert Ethernet from ether_addr_copy() to eth_hw_addr_set(): @@ expression dev, np; @@ - ether_addr_copy(dev->dev_addr, np) + eth_hw_addr_set(dev, np) Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 928c411bd509..e7b2e5fd8d24 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -323,7 +323,7 @@ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; - ether_addr_copy(dst->dev_addr, src->dev_addr); + eth_hw_addr_set(dst, src->dev_addr); } /** -- cgit v1.2.3 From d0c27c9211fef3ce8083cc3ad2ca3067d211edc9 Mon Sep 17 00:00:00 2001 From: Henrik Grimler Date: Wed, 29 Sep 2021 20:14:18 +0200 Subject: power: supply: max17042_battery: fix typo in MAX17042_IAvg_empty Datasheet gives the name IAvg_empty, not LAvg_empty. Signed-off-by: Henrik Grimler Reviewed-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power/max17042_battery.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index dd24756a8af7..c417abd2ab70 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -78,7 +78,7 @@ enum max17042_register { MAX17042_T_empty = 0x34, MAX17042_FullCAP0 = 0x35, - MAX17042_LAvg_empty = 0x36, + MAX17042_IAvg_empty = 0x36, MAX17042_FCTC = 0x37, MAX17042_RCOMP0 = 0x38, MAX17042_TempCo = 0x39, @@ -221,7 +221,7 @@ struct max17042_config_data { u16 fullcap; /* 0x10 */ u16 fullcapnom; /* 0x23 */ u16 socempty; /* 0x33 */ - u16 lavg_empty; /* 0x36 */ + u16 iavg_empty; /* 0x36 */ u16 dqacc; /* 0x45 */ u16 dpacc; /* 0x46 */ u16 qrtbl00; /* 0x12 */ -- cgit v1.2.3 From dae9a6cab8009e526570e7477ce858dcdfeb256e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Sep 2021 17:06:21 -0400 Subject: NFSD: Have legacy NFSD WRITE decoders use xdr_stream_subsegment() Refactor. Now that the NFSv2 and NFSv3 XDR decoders have been converted to use xdr_streams, the WRITE decoder functions can use xdr_stream_subsegment() to extract the WRITE payload into its own xdr_buf, just as the NFSv4 WRITE XDR decoder currently does. That makes it possible to pass the first kvec, pages array + length, page_base, and total payload length via a single function parameter. The payload's page_base is not yet assigned or used, but will be in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 064c96157d1f..6263410c948a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -532,8 +532,7 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct page **pages, - struct kvec *first, size_t total); + struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -- cgit v1.2.3 From ca05cbae2a0468e5d78e9b4605936a8bf5da328b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 Jul 2021 18:07:14 -0400 Subject: NFS: Fix up nfs_ctx_key_to_expire() If the cached credential exists but doesn't have any expiration callback then exit early. 
Fix up atomicity issues when replacing the credential with a new one, since the existing code could lead to refcount leaks.

Signed-off-by: Trond Myklebust
---
 include/linux/nfs_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b9a8b925db43..9b75448ce0df 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -81,7 +81,7 @@ struct nfs_open_context {
 	fl_owner_t flock_owner;
 	struct dentry *dentry;
 	const struct cred *cred;
-	struct rpc_cred *ll_cred;	/* low-level cred - use to check for expiry */
+	struct rpc_cred __rcu *ll_cred;	/* low-level cred - use to check for expiry */
 	struct nfs4_state *state;
 	fmode_t mode;
-- cgit v1.2.3


From b9f8713f42af11ae6d7f63075334ba5298436be6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 12 Jul 2021 12:24:15 -0400
Subject: SUNRPC: Remove unnecessary memory barriers

The only check for RPC_TASK_RUNNING is the one in rpc_make_runnable(), which happens under the same spin lock that is held when we call rpc_clear_running(). Ditto, the last check for RPC_TASK_QUEUED in rpc_execute() is performed under the same lock as the one held when we call rpc_clear_queued().

Signed-off-by: Trond Myklebust
---
 include/linux/sunrpc/sched.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a237b8dbf608..db964bb63912 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -150,25 +150,13 @@ struct rpc_task_setup {
 #define RPC_TASK_MSG_PIN_WAIT	5
 #define RPC_TASK_SIGNALLED	6

-#define RPC_IS_RUNNING(t)	test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
-#define rpc_set_running(t)	set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_test_and_set_running(t) \
 				test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
-#define rpc_clear_running(t)	\
-	do { \
-		smp_mb__before_atomic(); \
-		clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \
-		smp_mb__after_atomic(); \
-	} while (0)
+#define rpc_clear_running(t)	clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)

 #define RPC_IS_QUEUED(t)	test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
 #define rpc_set_queued(t)	set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
-#define rpc_clear_queued(t)	\
-	do { \
-		smp_mb__before_atomic(); \
-		clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \
-		smp_mb__after_atomic(); \
-	} while (0)
+#define rpc_clear_queued(t)	clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)

 #define RPC_IS_ACTIVATED(t)	test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate)
-- cgit v1.2.3


From ff81dfb5d721fff87bd516c558847f6effb70031 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Tue, 28 Sep 2021 14:33:44 -0400
Subject: NFS: Further optimisations for 'ls -l'

If a user is doing 'ls -l', we have a heuristic in GETATTR that tells the readdir code to try to use READDIRPLUS in order to refresh the inode attributes. In certain circumstances, we also try to invalidate the remaining directory entries in order to ensure this refresh.

If there are multiple readers of the directory, we probably should avoid invalidating the page cache, since the heuristic breaks down in that situation anyway.
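The hint therefore moves from page cache invalidation to a per-inode flag bit, roughly (sketch of the intended pattern; the readdir side lives outside this header-only view):

    /* GETATTR heuristic: request a READDIRPLUS refresh */
    set_bit(NFS_INO_FORCE_READDIR, &NFS_I(inode)->flags);

    /* readdir: honour and clear the one-shot hint */
    if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &NFS_I(inode)->flags))
	    /* use READDIRPLUS for this pass */;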
Signed-off-by: Trond Myklebust Tested-by: Benjamin Coddington Reviewed-by: Benjamin Coddington --- include/linux/nfs_fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9b75448ce0df..ca547cc5458c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -103,6 +103,7 @@ struct nfs_open_dir_context { __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; + pgoff_t page_index; signed char duped; }; @@ -181,9 +182,6 @@ struct nfs_inode { struct rw_semaphore rmdir_sem; struct mutex commit_mutex; - /* track last access to cached pages */ - unsigned long page_index; - #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ @@ -272,6 +270,7 @@ struct nfs4_copy_state { #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ +#define NFS_INO_FORCE_READDIR (7) /* force readdirplus */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ -- cgit v1.2.3 From 3734b9f2cee01d9dde2fbbda742ba6dd6ba10a29 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 27 Sep 2021 01:40:24 +0300 Subject: opp: Change type of dev_pm_opp_attach_genpd(names) argument Elements of the 'names' array are not changed by the code, constify them for consistency. Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 84150a22fd7c..7f1f066fc377 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -156,9 +156,9 @@ int devm_pm_opp_set_clkname(struct device *dev, const char *name); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); int devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); -struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); +struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs); void dev_pm_opp_detach_genpd(struct opp_table *opp_table); -int devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); +int devm_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs); struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); @@ -376,7 +376,7 @@ static inline int devm_pm_opp_set_clkname(struct device *dev, const char *name) return -EOPNOTSUPP; } -static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs) +static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char * const *names, struct device ***virt_devs) { return ERR_PTR(-EOPNOTSUPP); } @@ 
-384,7 +384,7 @@ static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, cons static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {} static inline int devm_pm_opp_attach_genpd(struct device *dev, - const char **names, + const char * const *names, struct device ***virt_devs) { return -EOPNOTSUPP; -- cgit v1.2.3 From 46b49b12f3fc5e1347dba37d4639e2165f447871 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 8 Sep 2021 17:58:33 -0500 Subject: arch/cc: Introduce a function to check for confidential computing features In preparation for other confidential computing technologies, introduce a generic helper function, cc_platform_has(), that can be used to check for specific active confidential computing attributes, like memory encryption. This is intended to eliminate having to add multiple technology-specific checks to the code (e.g. if (sev_active() || tdx_active() || ... ). [ bp: s/_CC_PLATFORM_H/_LINUX_CC_PLATFORM_H/g ] Co-developed-by: Andi Kleen Signed-off-by: Andi Kleen Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210928191009.32551-3-bp@alien8.de --- include/linux/cc_platform.h | 88 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 include/linux/cc_platform.h (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h new file mode 100644 index 000000000000..a075b70b9a70 --- /dev/null +++ b/include/linux/cc_platform.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + */ + +#ifndef _LINUX_CC_PLATFORM_H +#define _LINUX_CC_PLATFORM_H + +#include +#include + +/** + * enum cc_attr - Confidential computing attributes + * + * These attributes represent confidential computing features that are + * currently active. + */ +enum cc_attr { + /** + * @CC_ATTR_MEM_ENCRYPT: Memory encryption is active + * + * The platform/OS is running with active memory encryption. This + * includes running either as a bare-metal system or a hypervisor + * and actively using memory encryption or as a guest/virtual machine + * and actively using memory encryption. + * + * Examples include SME, SEV and SEV-ES. + */ + CC_ATTR_MEM_ENCRYPT, + + /** + * @CC_ATTR_HOST_MEM_ENCRYPT: Host memory encryption is active + * + * The platform/OS is running as a bare-metal system or a hypervisor + * and actively using memory encryption. + * + * Examples include SME. + */ + CC_ATTR_HOST_MEM_ENCRYPT, + + /** + * @CC_ATTR_GUEST_MEM_ENCRYPT: Guest memory encryption is active + * + * The platform/OS is running as a guest/virtual machine and actively + * using memory encryption. + * + * Examples include SEV and SEV-ES. + */ + CC_ATTR_GUEST_MEM_ENCRYPT, + + /** + * @CC_ATTR_GUEST_STATE_ENCRYPT: Guest state encryption is active + * + * The platform/OS is running as a guest/virtual machine and actively + * using memory encryption and register state encryption. + * + * Examples include SEV-ES. 
+ */ + CC_ATTR_GUEST_STATE_ENCRYPT, +}; + +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + +/** + * cc_platform_has() - Checks if the specified cc_attr attribute is active + * @attr: Confidential computing attribute to check + * + * The cc_platform_has() function will return an indicator as to whether the + * specified Confidential Computing attribute is currently active. + * + * Context: Any context + * Return: + * * TRUE - Specified Confidential Computing attribute is active + * * FALSE - Specified Confidential Computing attribute is not active + */ +bool cc_platform_has(enum cc_attr attr); + +#else /* !CONFIG_ARCH_HAS_CC_PLATFORM */ + +static inline bool cc_platform_has(enum cc_attr attr) { return false; } + +#endif /* CONFIG_ARCH_HAS_CC_PLATFORM */ + +#endif /* _LINUX_CC_PLATFORM_H */ -- cgit v1.2.3 From e9d1d2bb75b2d5d4b426769c5aae0ce8cef3558f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 8 Sep 2021 17:58:39 -0500 Subject: treewide: Replace the use of mem_encrypt_active() with cc_platform_has() Replace uses of mem_encrypt_active() with calls to cc_platform_has() with the CC_ATTR_MEM_ENCRYPT attribute. Remove the implementation of mem_encrypt_active() across all arches. For s390, since the default implementation of the cc_platform_has() matches the s390 implementation of mem_encrypt_active(), cc_platform_has() does not need to be implemented in s390 (the config option ARCH_HAS_CC_PLATFORM is not set). Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210928191009.32551-9-bp@alien8.de --- include/linux/mem_encrypt.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 5c4a18a91f89..ae4526389261 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -16,10 +16,6 @@ #include -#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ - -static inline bool mem_encrypt_active(void) { return false; } - #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ #ifdef CONFIG_AMD_MEM_ENCRYPT -- cgit v1.2.3 From e69709f6861aaba80b4eaab6825e8c522a2afb5c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 20 Sep 2021 20:22:46 +0300 Subject: opp: Add more resource-managed variants of dev_pm_opp_of_add_table() Add resource-managed variants of dev_pm_opp_of_add_table_indexed() and dev_pm_opp_of_add_table_noclk(), allowing drivers to remove boilerplate code. 
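With the devm variants a typical probe path shrinks to (illustrative sketch; 'foo_probe' is hypothetical):

    static int foo_probe(struct platform_device *pdev)
    {
	    int ret;

	    ret = devm_pm_opp_of_add_table_indexed(&pdev->dev, 0);
	    if (ret)
		    return ret;

	    /* no remove callback needed: the table is dropped on driver detach */
	    return 0;
    }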
Signed-off-by: Dmitry Osipenko [ Viresh: Added underscore to devm_of_add_table_indexed() ] Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 7f1f066fc377..879c138c7b8e 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -439,7 +439,9 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); +int devm_pm_opp_of_add_table_indexed(struct device *dev, int index); int dev_pm_opp_of_add_table_noclk(struct device *dev, int index); +int devm_pm_opp_of_add_table_noclk(struct device *dev, int index); void dev_pm_opp_of_remove_table(struct device *dev); int devm_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask); @@ -465,11 +467,21 @@ static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) return -EOPNOTSUPP; } +static inline int devm_pm_opp_of_add_table_indexed(struct device *dev, int index) +{ + return -EOPNOTSUPP; +} + static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index) { return -EOPNOTSUPP; } +static inline int devm_pm_opp_of_add_table_noclk(struct device *dev, int index) +{ + return -EOPNOTSUPP; +} + static inline void dev_pm_opp_of_remove_table(struct device *dev) { } -- cgit v1.2.3 From 19198e4ec97dc9d173b458a75ace3c3ca55c2d85 Mon Sep 17 00:00:00 2001 From: Prabhakar Kushwaha Date: Mon, 4 Oct 2021 09:58:39 +0300 Subject: qed: Fix kernel-doc warnings This patch fixes all the qed and qede kernel-doc warnings according to the guidelines that are described in Documentation/doc-guide/kernel-doc.rst. Signed-off-by: Ariel Elior Signed-off-by: Omkar Kulkarni Signed-off-by: Shai Malin Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 97 +++++++------- include/linux/qed/qed_if.h | 255 ++++++++++++++++++++----------------- include/linux/qed/qed_iscsi_if.h | 2 +- include/linux/qed/qed_ll2_if.h | 42 +++--- include/linux/qed/qed_nvmetcp_if.h | 17 +++ 5 files changed, 230 insertions(+), 183 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index f34dbd0db795..a84063492c71 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -268,14 +268,15 @@ static inline dma_addr_t qed_chain_get_pbl_phys(const struct qed_chain *chain) } /** - * @brief qed_chain_advance_page - + * qed_chain_advance_page(): Advance the next element across pages for a + * linked chain. * - * Advance the next element across pages for a linked chain + * @p_chain: P_chain. + * @p_next_elem: P_next_elem. + * @idx_to_inc: Idx_to_inc. + * @page_to_inc: page_to_inc. * - * @param p_chain - * @param p_next_elem - * @param idx_to_inc - * @param page_to_inc + * Return: Void. */ static inline void qed_chain_advance_page(struct qed_chain *p_chain, @@ -336,12 +337,14 @@ qed_chain_advance_page(struct qed_chain *p_chain, } while (0) /** - * @brief qed_chain_return_produced - + * qed_chain_return_produced(): A chain in which the driver "Produces" + * elements should use this API + * to indicate previous produced elements + * are now consumed. 
* - * A chain in which the driver "Produces" elements should use this API - * to indicate previous produced elements are now consumed. + * @p_chain: Chain. * - * @param p_chain + * Return: Void. */ static inline void qed_chain_return_produced(struct qed_chain *p_chain) { @@ -353,15 +356,15 @@ static inline void qed_chain_return_produced(struct qed_chain *p_chain) } /** - * @brief qed_chain_produce - + * qed_chain_produce(): A chain in which the driver "Produces" + * elements should use this to get a pointer to + * the next element which can be "Produced". It's driver + * responsibility to validate that the chain has room for + * new element. * - * A chain in which the driver "Produces" elements should use this to get - * a pointer to the next element which can be "Produced". It's driver - * responsibility to validate that the chain has room for new element. + * @p_chain: Chain. * - * @param p_chain - * - * @return void*, a pointer to next element + * Return: void*, a pointer to next element. */ static inline void *qed_chain_produce(struct qed_chain *p_chain) { @@ -395,14 +398,11 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain) } /** - * @brief qed_chain_get_capacity - - * - * Get the maximum number of BDs in chain + * qed_chain_get_capacity(): Get the maximum number of BDs in chain * - * @param p_chain - * @param num + * @p_chain: Chain. * - * @return number of unusable BDs + * Return: number of unusable BDs. */ static inline u32 qed_chain_get_capacity(struct qed_chain *p_chain) { @@ -410,12 +410,14 @@ static inline u32 qed_chain_get_capacity(struct qed_chain *p_chain) } /** - * @brief qed_chain_recycle_consumed - + * qed_chain_recycle_consumed(): Returns an element which was + * previously consumed; + * Increments producers so they could + * be written to FW. * - * Returns an element which was previously consumed; - * Increments producers so they could be written to FW. + * @p_chain: Chain. * - * @param p_chain + * Return: Void. */ static inline void qed_chain_recycle_consumed(struct qed_chain *p_chain) { @@ -427,14 +429,13 @@ static inline void qed_chain_recycle_consumed(struct qed_chain *p_chain) } /** - * @brief qed_chain_consume - + * qed_chain_consume(): A Chain in which the driver utilizes data written + * by a different source (i.e., FW) should use this to + * access passed buffers. * - * A Chain in which the driver utilizes data written by a different source - * (i.e., FW) should use this to access passed buffers. + * @p_chain: Chain. * - * @param p_chain - * - * @return void*, a pointer to the next buffer written + * Return: void*, a pointer to the next buffer written. */ static inline void *qed_chain_consume(struct qed_chain *p_chain) { @@ -468,9 +469,11 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain) } /** - * @brief qed_chain_reset - Resets the chain to its start state + * qed_chain_reset(): Resets the chain to its start state. + * + * @p_chain: pointer to a previously allocated chain. * - * @param p_chain pointer to a previously allocated chain + * Return Void. */ static inline void qed_chain_reset(struct qed_chain *p_chain) { @@ -519,13 +522,12 @@ static inline void qed_chain_reset(struct qed_chain *p_chain) } /** - * @brief qed_chain_get_last_elem - + * qed_chain_get_last_elem(): Returns a pointer to the last element of the + * chain. * - * Returns a pointer to the last element of the chain + * @p_chain: Chain. * - * @param p_chain - * - * @return void* + * Return: void*. 
*/ static inline void *qed_chain_get_last_elem(struct qed_chain *p_chain) { @@ -563,10 +565,13 @@ out: } /** - * @brief qed_chain_set_prod - sets the prod to the given value + * qed_chain_set_prod(): sets the prod to the given value. + * + * @p_chain: Chain. + * @prod_idx: Prod Idx. + * @p_prod_elem: Prod elem. * - * @param prod_idx - * @param p_prod_elem + * Return Void. */ static inline void qed_chain_set_prod(struct qed_chain *p_chain, u32 prod_idx, void *p_prod_elem) @@ -610,9 +615,11 @@ static inline void qed_chain_set_prod(struct qed_chain *p_chain, } /** - * @brief qed_chain_pbl_zero_mem - set chain memory to 0 + * qed_chain_pbl_zero_mem(): set chain memory to 0. + * + * @p_chain: Chain. * - * @param p_chain + * Return: Void. */ static inline void qed_chain_pbl_zero_mem(struct qed_chain *p_chain) { diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 850b98991670..f39451aaaeec 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -819,47 +819,47 @@ struct qed_common_cb_ops { struct qed_selftest_ops { /** - * @brief selftest_interrupt - Perform interrupt test + * selftest_interrupt(): Perform interrupt test. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*selftest_interrupt)(struct qed_dev *cdev); /** - * @brief selftest_memory - Perform memory test + * selftest_memory(): Perform memory test. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*selftest_memory)(struct qed_dev *cdev); /** - * @brief selftest_register - Perform register test + * selftest_register(): Perform register test. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*selftest_register)(struct qed_dev *cdev); /** - * @brief selftest_clock - Perform clock test + * selftest_clock(): Perform clock test. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*selftest_clock)(struct qed_dev *cdev); /** - * @brief selftest_nvram - Perform nvram test + * selftest_nvram(): Perform nvram test. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*selftest_nvram) (struct qed_dev *cdev); }; @@ -927,47 +927,53 @@ struct qed_common_ops { enum qed_hw_err_type err_type); /** - * @brief can_link_change - can the instance change the link or not + * can_link_change(): can the instance change the link or not. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return true if link-change is allowed, false otherwise. + * Return: true if link-change is allowed, false otherwise. */ bool (*can_link_change)(struct qed_dev *cdev); /** - * @brief set_link - set links according to params + * set_link(): set links according to params. * - * @param cdev - * @param params - values used to override the default link configuration + * @cdev: Qed dev pointer. + * @params: values used to override the default link configuration. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*set_link)(struct qed_dev *cdev, struct qed_link_params *params); /** - * @brief get_link - returns the current link state. + * get_link(): returns the current link state. 
* - * @param cdev - * @param if_link - structure to be filled with current link configuration. + * @cdev: Qed dev pointer. + * @if_link: structure to be filled with current link configuration. + * + * Return: Void. */ void (*get_link)(struct qed_dev *cdev, struct qed_link_output *if_link); /** - * @brief - drains chip in case Tx completions fail to arrive due to pause. + * drain(): drains chip in case Tx completions fail to arrive due to pause. + * + * @cdev: Qed dev pointer. * - * @param cdev + * Return: Int. */ int (*drain)(struct qed_dev *cdev); /** - * @brief update_msglvl - update module debug level + * update_msglvl(): update module debug level. * - * @param cdev - * @param dp_module - * @param dp_level + * @cdev: Qed dev pointer. + * @dp_module: Debug module. + * @dp_level: Debug level. + * + * Return: Void. */ void (*update_msglvl)(struct qed_dev *cdev, u32 dp_module, @@ -981,70 +987,73 @@ struct qed_common_ops { struct qed_chain *p_chain); /** - * @brief nvm_flash - Flash nvm data. + * nvm_flash(): Flash nvm data. * - * @param cdev - * @param name - file containing the data + * @cdev: Qed dev pointer. + * @name: file containing the data. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*nvm_flash)(struct qed_dev *cdev, const char *name); /** - * @brief nvm_get_image - reads an entire image from nvram + * nvm_get_image(): reads an entire image from nvram. * - * @param cdev - * @param type - type of the request nvram image - * @param buf - preallocated buffer to fill with the image - * @param len - length of the allocated buffer + * @cdev: Qed dev pointer. + * @type: type of the request nvram image. + * @buf: preallocated buffer to fill with the image. + * @len: length of the allocated buffer. * - * @return 0 on success, error otherwise + * Return: 0 on success, error otherwise. */ int (*nvm_get_image)(struct qed_dev *cdev, enum qed_nvm_images type, u8 *buf, u16 len); /** - * @brief set_coalesce - Configure Rx coalesce value in usec + * set_coalesce(): Configure Rx coalesce value in usec. * - * @param cdev - * @param rx_coal - Rx coalesce value in usec - * @param tx_coal - Tx coalesce value in usec - * @param qid - Queue index - * @param sb_id - Status Block Id + * @cdev: Qed dev pointer. + * @rx_coal: Rx coalesce value in usec. + * @tx_coal: Tx coalesce value in usec. + * @handle: Handle. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, void *handle); /** - * @brief set_led - Configure LED mode + * set_led(): Configure LED mode. * - * @param cdev - * @param mode - LED mode + * @cdev: Qed dev pointer. + * @mode: LED mode. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*set_led)(struct qed_dev *cdev, enum qed_led_mode mode); /** - * @brief attn_clr_enable - Prevent attentions from being reasserted + * attn_clr_enable(): Prevent attentions from being reasserted. + * + * @cdev: Qed dev pointer. + * @clr_enable: Clear enable. * - * @param cdev - * @param clr_enable + * Return: Void. */ void (*attn_clr_enable)(struct qed_dev *cdev, bool clr_enable); /** - * @brief db_recovery_add - add doorbell information to the doorbell - * recovery mechanism. + * db_recovery_add(): add doorbell information to the doorbell + * recovery mechanism.
* - * @param cdev - * @param db_addr - doorbell address - * @param db_data - address of where db_data is stored - * @param db_is_32b - doorbell is 32b pr 64b - * @param db_is_user - doorbell recovery addresses are user or kernel space + * @cdev: Qed dev pointer. + * @db_addr: Doorbell address. + * @db_data: Address of where db_data is stored. + * @db_width: Doorbell is 32b or 64b. + * @db_space: Doorbell recovery addresses are user or kernel space. + * + * Return: Int. */ int (*db_recovery_add)(struct qed_dev *cdev, void __iomem *db_addr, @@ -1053,114 +1062,130 @@ struct qed_common_ops { enum qed_db_rec_space db_space); /** - * @brief db_recovery_del - remove doorbell information from the doorbell + * db_recovery_del(): remove doorbell information from the doorbell * recovery mechanism. db_data serves as key (db_addr is not unique). * - * @param cdev - * @param db_addr - doorbell address - * @param db_data - address where db_data is stored. Serves as key for the - * entry to delete. + * @cdev: Qed dev pointer. + * @db_addr: Doorbell address. + * @db_data: Address where db_data is stored. Serves as key for the + * entry to delete. + * + * Return: Int. */ int (*db_recovery_del)(struct qed_dev *cdev, void __iomem *db_addr, void *db_data); /** - * @brief recovery_process - Trigger a recovery process + * recovery_process(): Trigger a recovery process. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*recovery_process)(struct qed_dev *cdev); /** - * @brief recovery_prolog - Execute the prolog operations of a recovery process + * recovery_prolog(): Execute the prolog operations of a recovery process. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*recovery_prolog)(struct qed_dev *cdev); /** - * @brief update_drv_state - API to inform the change in the driver state. + * update_drv_state(): API to inform the change in the driver state. * - * @param cdev - * @param active + * @cdev: Qed dev pointer. + * @active: Active. * + * Return: Int. */ int (*update_drv_state)(struct qed_dev *cdev, bool active); /** - * @brief update_mac - API to inform the change in the mac address + * update_mac(): API to inform the change in the mac address. * - * @param cdev - * @param mac + * @cdev: Qed dev pointer. + * @mac: MAC. * + * Return: Int. */ int (*update_mac)(struct qed_dev *cdev, u8 *mac); /** - * @brief update_mtu - API to inform the change in the mtu + * update_mtu(): API to inform the change in the mtu. * - * @param cdev - * @param mtu + * @cdev: Qed dev pointer. + * @mtu: MTU. * + * Return: Int. */ int (*update_mtu)(struct qed_dev *cdev, u16 mtu); /** - * @brief update_wol - update of changes in the WoL configuration + * update_wol(): Update of changes in the WoL configuration. + * + * @cdev: Qed dev pointer. + * @enabled: true iff WoL should be enabled. * - * @param cdev - * @param enabled - true iff WoL should be enabled. + * Return: Int. */ int (*update_wol) (struct qed_dev *cdev, bool enabled); /** - * @brief read_module_eeprom + * read_module_eeprom(): Read EEPROM. * - * @param cdev - * @param buf - buffer - * @param dev_addr - PHY device memory region - * @param offset - offset into eeprom contents to be read - * @param len - buffer length, i.e., max bytes to be read + * @cdev: Qed dev pointer. + * @buf: buffer. + * @dev_addr: PHY device memory region. + * @offset: offset into eeprom contents to be read.
+ * @len: buffer length, i.e., max bytes to be read. + * + * Return: Int. */ int (*read_module_eeprom)(struct qed_dev *cdev, char *buf, u8 dev_addr, u32 offset, u32 len); /** - * @brief get_affin_hwfn_idx + * get_affin_hwfn_idx(): Get affine HW function. + * + * @cdev: Qed dev pointer. * - * @param cdev + * Return: u8. */ u8 (*get_affin_hwfn_idx)(struct qed_dev *cdev); /** - * @brief read_nvm_cfg - Read NVM config attribute value. - * @param cdev - * @param buf - buffer - * @param cmd - NVM CFG command id - * @param entity_id - Entity id + * read_nvm_cfg(): Read NVM config attribute value. + * + * @cdev: Qed dev pointer. + * @buf: Buffer. + * @cmd: NVM CFG command id. + * @entity_id: Entity id. * + * Return: Int. */ int (*read_nvm_cfg)(struct qed_dev *cdev, u8 **buf, u32 cmd, u32 entity_id); /** - * @brief read_nvm_cfg - Read NVM config attribute value. - * @param cdev - * @param cmd - NVM CFG command id + * read_nvm_cfg_len(): Read NVM config attribute value. * - * @return config id length, 0 on error. + * @cdev: Qed dev pointer. + * @cmd: NVM CFG command id. + * + * Return: config id length, 0 on error. */ int (*read_nvm_cfg_len)(struct qed_dev *cdev, u32 cmd); /** - * @brief set_grc_config - Configure value for grc config id. - * @param cdev - * @param cfg_id - grc config id - * @param val - grc config value + * set_grc_config(): Configure value for grc config id. + * + * @cdev: Qed dev pointer. + * @cfg_id: grc config id + * @val: grc config value * + * Return: Int. */ int (*set_grc_config)(struct qed_dev *cdev, u32 cfg_id, u32 val); @@ -1397,18 +1422,16 @@ static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info) } /** + * qed_sb_ack(): This function creates an update command for interrupts + * that is written to the IGU. * - * @brief This function creates an update command for interrupts that is - * written to the IGU. - * - * @param sb_info - This is the structure allocated and - * initialized per status block. Assumption is - * that it was initialized using qed_sb_init - * @param int_cmd - Enable/Disable/Nop - * @param upd_flg - whether igu consumer should be - * updated. + * @sb_info: This is the structure allocated and + * initialized per status block. Assumption is + * that it was initialized using qed_sb_init + * @int_cmd: Enable/Disable/Nop + * @upd_flg: Whether igu consumer should be updated. * - * @return inline void + * Return: inline void. */ static inline void qed_sb_ack(struct qed_sb_info *sb_info, enum igu_int_cmd int_cmd, diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h index 04180d9af560..494cdc3cd840 100644 --- a/include/linux/qed/qed_iscsi_if.h +++ b/include/linux/qed/qed_iscsi_if.h @@ -182,7 +182,7 @@ struct qed_iscsi_cb_ops { * @param stats - pointer to struck that would be filled * we stats * @return 0 on success, error otherwise. - * @change_mac Change MAC of interface + * @change_mac: Change MAC of interface * @param cdev * @param handle - the connection handle. * @param mac - new MAC to configure. diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index ff808d248883..5b67cd03276e 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -208,57 +208,57 @@ enum qed_ll2_xmit_flags { struct qed_ll2_ops { /** - * @brief start - initializes ll2 + * start(): Initializes ll2. * - * @param cdev - * @param params - protocol driver configuration for the ll2. + * @cdev: Qed dev pointer. + * @params: Protocol driver configuration for the ll2. 
* - * @return 0 on success, otherwise error value. + * Return: 0 on success, otherwise error value. */ int (*start)(struct qed_dev *cdev, struct qed_ll2_params *params); /** - * @brief stop - stops the ll2 + * stop(): Stops the ll2. * - * @param cdev + * @cdev: Qed dev pointer. * - * @return 0 on success, otherwise error value. + * Return: 0 on success, otherwise error value. */ int (*stop)(struct qed_dev *cdev); /** - * @brief start_xmit - transmits an skb over the ll2 interface + * start_xmit(): Transmits an skb over the ll2 interface. * - * @param cdev - * @param skb - * @param xmit_flags - Transmit options defined by the enum qed_ll2_xmit_flags. + * @cdev: Qed dev pointer. + * @skb: SKB. + * @xmit_flags: Transmit options defined by the enum qed_ll2_xmit_flags. * - * @return 0 on success, otherwise error value. + * Return: 0 on success, otherwise error value. */ int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb, unsigned long xmit_flags); /** - * @brief register_cb_ops - protocol driver register the callback for Rx/Tx + * register_cb_ops(): Protocol driver registers the callback for Rx/Tx * packets. Should be called before `start'. * - * @param cdev - * @param cookie - to be passed to the callback functions. - * @param ops - the callback functions to register for Rx / Tx. + * @cdev: Qed dev pointer. + * @cookie: to be passed to the callback functions. + * @ops: the callback functions to register for Rx / Tx. * - * @return 0 on success, otherwise error value. + * Return: 0 on success, otherwise error value. */ void (*register_cb_ops)(struct qed_dev *cdev, const struct qed_ll2_cb_ops *ops, void *cookie); /** - * @brief get LL2 related statistics + * get_stats(): Get LL2 related statistics. * - * @param cdev - * @param stats - pointer to struct that would be filled with stats + * @cdev: Qed dev pointer. + * @stats: Pointer to struct that would be filled with stats. * - * @return 0 on success, error otherwise. + * Return: 0 on success, error otherwise. */ int (*get_stats)(struct qed_dev *cdev, struct qed_ll2_stats *stats); }; diff --git a/include/linux/qed/qed_nvmetcp_if.h b/include/linux/qed/qed_nvmetcp_if.h index 14671bc19ed1..1d51df347560 100644 --- a/include/linux/qed/qed_nvmetcp_if.h +++ b/include/linux/qed/qed_nvmetcp_if.h @@ -171,6 +171,23 @@ struct nvmetcp_task_params { * @param dest_port * @clear_all_filters: Clear all filters. * @param cdev + * @init_read_io: Init read IO. + * @task_params + * @cmd_pdu_header + * @nvme_cmd + * @sgl_task_params + * @init_write_io: Init write IO. + * @task_params + * @cmd_pdu_header + * @nvme_cmd + * @sgl_task_params + * @init_icreq_exchange: Exchange ICReq. + * @task_params + * @init_conn_req_pdu_hdr + * @tx_sgl_task_params + * @rx_sgl_task_params + * @init_task_cleanup: Init task cleanup. + * @task_params */ struct qed_nvmetcp_ops { const struct qed_common_ops *common; -- cgit v1.2.3 From fb09a1ed5c6e507499a9da54bfd34f71a2673961 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Mon, 4 Oct 2021 09:58:40 +0300 Subject: qed: Remove e4_ and _e4 from FW HSI The existing qed/qede/qedr/qedi/qedf code uses chip-specific naming in structures, functions, variables and defines in FW HSI (Hardware Software Interface). The new FW version introduced a generic naming convention in HSI, in which the same code is used across different versions, for simpler maintainability. It also eases adding support for new features. With this patch, every "_e4" or "e4_" prefix or suffix is no longer needed and is removed.
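For illustration, a minimal before/after sketch of what the rename means for driver code. qed_dump_sb() is a hypothetical helper written for this example; only the type and macro names (status_block_e4/status_block, PIS_PER_SB_E4/PIS_PER_SB) come from the patch itself:

	#include <linux/printk.h>
	#include <linux/qed/common_hsi.h>

	/* Was: static void qed_dump_sb(struct status_block_e4 *sb) */
	static void qed_dump_sb(struct status_block *sb)
	{
		int pi;

		/* Was: pi < PIS_PER_SB_E4 */
		for (pi = 0; pi < PIS_PER_SB; pi++)
			pr_info("pi_array[%d] = %u\n",
				pi, le16_to_cpu(sb->pi_array[pi]));
	}

The structure layouts are unchanged; only the chip-specific names are dropped, so call sites like this need a mechanical rename and no logic changes.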
Reviewed-by: Manish Rangankar Reviewed-by: Javed Hasan Signed-off-by: Ariel Elior Signed-off-by: Omkar Kulkarni Signed-off-by: Shai Malin Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 28 +-- include/linux/qed/fcoe_common.h | 362 ++++++++++++++++++------------------- include/linux/qed/iscsi_common.h | 360 ++++++++++++++++++------------------ include/linux/qed/nvmetcp_common.h | 18 +- include/linux/qed/qed_if.h | 5 +- 5 files changed, 386 insertions(+), 387 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 0a3807e927c5..3742d1f7d1f7 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -133,7 +133,7 @@ #define NUM_OF_TCS (NUM_OF_PHYS_TCS + 1) /* CIDs */ -#define NUM_OF_CONNECTION_TYPES_E4 (8) +#define NUM_OF_CONNECTION_TYPES (8) #define NUM_OF_LCIDS (320) #define NUM_OF_LTIDS (320) @@ -379,7 +379,7 @@ #define CAU_FSM_ETH_TX 1 /* Number of Protocol Indices per Status Block */ -#define PIS_PER_SB_E4 12 +#define PIS_PER_SB 12 #define MAX_PIS_PER_SB PIS_PER_SB #define CAU_HC_STOPPED_STATE 3 @@ -1221,20 +1221,20 @@ struct rdif_task_context { }; /* Status block structure */ -struct status_block_e4 { - __le16 pi_array[PIS_PER_SB_E4]; +struct status_block { + __le16 pi_array[PIS_PER_SB]; __le32 sb_num; -#define STATUS_BLOCK_E4_SB_NUM_MASK 0x1FF -#define STATUS_BLOCK_E4_SB_NUM_SHIFT 0 -#define STATUS_BLOCK_E4_ZERO_PAD_MASK 0x7F -#define STATUS_BLOCK_E4_ZERO_PAD_SHIFT 9 -#define STATUS_BLOCK_E4_ZERO_PAD2_MASK 0xFFFF -#define STATUS_BLOCK_E4_ZERO_PAD2_SHIFT 16 +#define STATUS_BLOCK_SB_NUM_MASK 0x1FF +#define STATUS_BLOCK_SB_NUM_SHIFT 0 +#define STATUS_BLOCK_ZERO_PAD_MASK 0x7F +#define STATUS_BLOCK_ZERO_PAD_SHIFT 9 +#define STATUS_BLOCK_ZERO_PAD2_MASK 0xFFFF +#define STATUS_BLOCK_ZERO_PAD2_SHIFT 16 __le32 prod_index; -#define STATUS_BLOCK_E4_PROD_INDEX_MASK 0xFFFFFF -#define STATUS_BLOCK_E4_PROD_INDEX_SHIFT 0 -#define STATUS_BLOCK_E4_ZERO_PAD3_MASK 0xFF -#define STATUS_BLOCK_E4_ZERO_PAD3_SHIFT 24 +#define STATUS_BLOCK_PROD_INDEX_MASK 0xFFFFFF +#define STATUS_BLOCK_PROD_INDEX_SHIFT 0 +#define STATUS_BLOCK_ZERO_PAD3_MASK 0xFF +#define STATUS_BLOCK_ZERO_PAD3_SHIFT 24 }; /* Tdif context */ diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h index 68eda1c21cde..7ba0abc867f1 100644 --- a/include/linux/qed/fcoe_common.h +++ b/include/linux/qed/fcoe_common.h @@ -150,49 +150,49 @@ struct ystorm_fcoe_task_st_ctx { u8 reserved2[8]; }; -struct e4_ystorm_fcoe_task_ag_ctx { +struct ystorm_fcoe_task_ag_ctx { u8 byte0; u8 byte1; __le16 word0; u8 flags0; -#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK 0xF -#define E4_YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT 0 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT 4 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 +#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define YSTORM_FCOE_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define YSTORM_FCOE_TASK_AG_CTX_BIT0_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_BIT0_SHIFT 4 +#define YSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define YSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 +#define 
YSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 u8 flags1; -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 0 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 6 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 +#define YSTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 +#define YSTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 0 +#define YSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define YSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 +#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 +#define YSTORM_FCOE_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 +#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 6 +#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 u8 flags2; -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT 0 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 6 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 +#define YSTORM_FCOE_TASK_AG_CTX_BIT4_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_BIT4_SHIFT 0 +#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define YSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 u8 byte2; __le32 reg0; u8 byte3; @@ -206,73 +206,73 @@ struct e4_ystorm_fcoe_task_ag_ctx { __le32 reg2; }; -struct e4_tstorm_fcoe_task_ag_ctx { +struct tstorm_fcoe_task_ag_ctx { u8 reserved; u8 byte1; __le16 icid; u8 flags0; -#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF -#define E4_TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 -#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT 6 -#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT 7 +#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF 
+#define TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define TSTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_WAIT_ABTS_RSP_F_SHIFT 6 +#define TSTORM_FCOE_TASK_AG_CTX_VALID_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_VALID_SHIFT 7 u8 flags1; -#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT 0 -#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT 1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT 2 -#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT 4 -#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 6 +#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_FALSE_RR_TOV_SHIFT 0 +#define TSTORM_FCOE_TASK_AG_CTX_BIT5_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_BIT5_SHIFT 1 +#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_SHIFT 2 +#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_SHIFT 4 +#define TSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 6 u8 flags2; -#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT 0 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 2 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT 4 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT 6 +#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_SHIFT 0 +#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 2 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_SHIFT 4 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_SHIFT 6 u8 flags3; -#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK 0x3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT 0 -#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT 2 -#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT 3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 4 -#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 5 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT 7 +#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_MASK 0x3 +#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_SHIFT 0 +#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_REC_RR_TOV_CF_EN_SHIFT 2 +#define 
TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_ED_TOV_CF_EN_SHIFT 3 +#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 4 +#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 5 +#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_INIT_CF_EN_SHIFT 7 u8 flags4; -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT 0 -#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT 1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 2 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 3 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 4 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 5 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 6 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 7 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_SEQ_RECOVERY_CF_EN_SHIFT 0 +#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_UNSOL_COMP_CF_EN_SHIFT 1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 2 +#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 3 +#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 4 +#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 5 +#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 6 +#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define TSTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 7 u8 cleanup_state; __le16 last_sent_tid; __le32 rec_rr_tov_exp_timeout; @@ -352,49 +352,49 @@ struct tstorm_fcoe_task_st_ctx { struct fcoe_tstorm_fcoe_task_st_ctx_read_only read_only; }; -struct e4_mstorm_fcoe_task_ag_ctx { +struct mstorm_fcoe_task_ag_ctx { u8 byte0; u8 byte1; __le16 icid; u8 flags0; -#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF -#define E4_MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 -#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT 5 -#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 -#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 +#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define MSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_CQE_PLACED_SHIFT 5 +#define MSTORM_FCOE_TASK_AG_CTX_BIT2_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_BIT2_SHIFT 6 +#define 
MSTORM_FCOE_TASK_AG_CTX_BIT3_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_BIT3_SHIFT 7 u8 flags1; -#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 -#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 0 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 4 -#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 +#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_MASK 0x3 +#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_SHIFT 0 +#define MSTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define MSTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 2 +#define MSTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define MSTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 4 +#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_EX_CLEANUP_CF_EN_SHIFT 6 +#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 7 u8 flags2; -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 0 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 -#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT 6 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 +#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 0 +#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_XFER_PLACEMENT_EN_SHIFT 6 +#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define MSTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 7 u8 cleanup_state; __le32 received_bytes; u8 byte3; @@ -440,56 +440,56 @@ struct mstorm_fcoe_task_st_ctx { struct scsi_cached_sges data_desc; }; -struct e4_ustorm_fcoe_task_ag_ctx { +struct ustorm_fcoe_task_ag_ctx { u8 reserved; u8 byte1; __le16 icid; u8 flags0; -#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF -#define E4_USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 -#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 6 +#define 
USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define USTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define USTORM_FCOE_TASK_AG_CTX_BIT1_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_BIT1_SHIFT 5 +#define USTORM_FCOE_TASK_AG_CTX_CF0_MASK 0x3 +#define USTORM_FCOE_TASK_AG_CTX_CF0_SHIFT 6 u8 flags1; -#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 0 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 2 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_MASK 0x3 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT 4 -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 +#define USTORM_FCOE_TASK_AG_CTX_CF1_MASK 0x3 +#define USTORM_FCOE_TASK_AG_CTX_CF1_SHIFT 0 +#define USTORM_FCOE_TASK_AG_CTX_CF2_MASK 0x3 +#define USTORM_FCOE_TASK_AG_CTX_CF2_SHIFT 2 +#define USTORM_FCOE_TASK_AG_CTX_CF3_MASK 0x3 +#define USTORM_FCOE_TASK_AG_CTX_CF3_SHIFT 4 +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 u8 flags2; -#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 0 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 1 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 2 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT 3 -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 5 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 6 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 7 +#define USTORM_FCOE_TASK_AG_CTX_CF0EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_CF0EN_SHIFT 0 +#define USTORM_FCOE_TASK_AG_CTX_CF1EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_CF1EN_SHIFT 1 +#define USTORM_FCOE_TASK_AG_CTX_CF2EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_CF2EN_SHIFT 2 +#define USTORM_FCOE_TASK_AG_CTX_CF3EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_CF3EN_SHIFT 3 +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 +#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE0EN_SHIFT 5 +#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE1EN_SHIFT 6 +#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE2EN_SHIFT 7 u8 flags3; -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 0 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 2 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 3 -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF -#define E4_USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 +#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE3EN_SHIFT 0 +#define USTORM_FCOE_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define 
USTORM_FCOE_TASK_AG_CTX_RULE4EN_SHIFT 1 +#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE5EN_SHIFT 2 +#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define USTORM_FCOE_TASK_AG_CTX_RULE6EN_SHIFT 3 +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF +#define USTORM_FCOE_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 __le32 dif_err_intervals; __le32 dif_error_1st_interval; __le32 global_cq_num; @@ -499,18 +499,18 @@ struct e4_ustorm_fcoe_task_ag_ctx { }; /* FCoE task context */ -struct e4_fcoe_task_context { +struct fcoe_task_context { struct ystorm_fcoe_task_st_ctx ystorm_st_context; struct regpair ystorm_st_padding[2]; struct tdif_task_context tdif_context; - struct e4_ystorm_fcoe_task_ag_ctx ystorm_ag_context; - struct e4_tstorm_fcoe_task_ag_ctx tstorm_ag_context; + struct ystorm_fcoe_task_ag_ctx ystorm_ag_context; + struct tstorm_fcoe_task_ag_ctx tstorm_ag_context; struct timers_context timer_context; struct tstorm_fcoe_task_st_ctx tstorm_st_context; struct regpair tstorm_st_padding[2]; - struct e4_mstorm_fcoe_task_ag_ctx mstorm_ag_context; + struct mstorm_fcoe_task_ag_ctx mstorm_ag_context; struct mstorm_fcoe_task_st_ctx mstorm_st_context; - struct e4_ustorm_fcoe_task_ag_ctx ustorm_ag_context; + struct ustorm_fcoe_task_ag_ctx ustorm_ag_context; struct rdif_task_context rdif_context; }; diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index 157019f716f1..1a60285a01e3 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -714,49 +714,49 @@ struct ystorm_iscsi_task_st_ctx { union iscsi_task_hdr pdu_hdr; }; -struct e4_ystorm_iscsi_task_ag_ctx { +struct ystorm_iscsi_task_ag_ctx { u8 reserved; u8 byte1; __le16 word0; u8 flags0; -#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF -#define E4_YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK 0x1 /* bit3 */ -#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT 7 +#define YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define YSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 +#define YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK 0x1 /* bit3 */ +#define YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT 7 u8 flags1; -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 0 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_MASK 0x3 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 6 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 +#define YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 +#define YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 0 +#define YSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define YSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 +#define YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_MASK 
0x3 +#define YSTORM_ISCSI_TASK_AG_CTX_CF2SPECIAL_SHIFT 4 +#define YSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 6 +#define YSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 u8 flags2; -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define YSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 u8 byte2; __le32 TTT; u8 byte3; @@ -764,49 +764,49 @@ struct e4_ystorm_iscsi_task_ag_ctx { __le16 word1; }; -struct e4_mstorm_iscsi_task_ag_ctx { +struct mstorm_iscsi_task_ag_ctx { u8 cdu_validation; u8 byte1; __le16 task_cid; u8 flags0; -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT 7 +#define MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 +#define MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_SHIFT 7 u8 flags1; -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_MASK 0x3 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT 0 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 4 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT 6 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_MASK 0x3 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_SHIFT 0 +#define MSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define MSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 2 +#define MSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 +#define MSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 4 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_CF_EN_SHIFT 6 +#define MSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 7 u8 flags2; -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 0 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 +#define MSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 0 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 2 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 3 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 4 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 5 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 6 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define MSTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 7 u8 byte2; __le32 reg0; u8 byte3; @@ -814,56 +814,56 @@ struct e4_mstorm_iscsi_task_ag_ctx { __le16 word1; }; -struct e4_ustorm_iscsi_task_ag_ctx { +struct ustorm_iscsi_task_ag_ctx { u8 reserved; u8 state; __le16 icid; u8 flags0; -#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF -#define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 -#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 -#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK 0x3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT 6 +#define USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_MASK 0xF +#define USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT 0 +#define USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT 4 +#define USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK 0x1 +#define 
USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT 5 +#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK 0x3 +#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT 6 u8 flags1; -#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_MASK 0x3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RESERVED1_SHIFT 0 -#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_MASK 0x3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_SHIFT 2 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 4 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 +#define USTORM_ISCSI_TASK_AG_CTX_RESERVED1_MASK 0x3 +#define USTORM_ISCSI_TASK_AG_CTX_RESERVED1_SHIFT 0 +#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_MASK 0x3 +#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_SHIFT 2 +#define USTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 +#define USTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 4 +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_MASK 0x3 +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_SHIFT 6 u8 flags2; -#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT 0 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT 1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_SHIFT 2 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT 5 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 6 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7 +#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_EN_SHIFT 0 +#define USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_DISABLE_DATA_ACKED_SHIFT 1 +#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_R2T2RECV_EN_SHIFT 2 +#define USTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 3 +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN_SHIFT 4 +#define USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_CMP_DATA_TOTAL_EXP_EN_SHIFT 5 +#define USTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 6 +#define USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7 u8 flags3; -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 0 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 2 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 -#define E4_USTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 3 -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF -#define E4_USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 +#define USTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define 
USTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 0 +#define USTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 1 +#define USTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 2 +#define USTORM_ISCSI_TASK_AG_CTX_RULE6EN_MASK 0x1 +#define USTORM_ISCSI_TASK_AG_CTX_RULE6EN_SHIFT 3 +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF +#define USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 __le32 dif_err_intervals; __le32 dif_error_1st_interval; __le32 rcv_cont_len; @@ -952,14 +952,14 @@ struct ustorm_iscsi_task_st_ctx { }; /* iscsi task context */ -struct e4_iscsi_task_context { +struct iscsi_task_context { struct ystorm_iscsi_task_st_ctx ystorm_st_context; - struct e4_ystorm_iscsi_task_ag_ctx ystorm_ag_context; + struct ystorm_iscsi_task_ag_ctx ystorm_ag_context; struct regpair ystorm_ag_padding[2]; struct tdif_task_context tdif_context; - struct e4_mstorm_iscsi_task_ag_ctx mstorm_ag_context; + struct mstorm_iscsi_task_ag_ctx mstorm_ag_context; struct regpair mstorm_ag_padding[2]; - struct e4_ustorm_iscsi_task_ag_ctx ustorm_ag_context; + struct ustorm_iscsi_task_ag_ctx ustorm_ag_context; struct mstorm_iscsi_task_st_ctx mstorm_st_context; struct ustorm_iscsi_task_st_ctx ustorm_st_context; struct rdif_task_context rdif_context; @@ -1431,73 +1431,73 @@ struct ystorm_iscsi_stats_drv { struct regpair iscsi_tx_tcp_pkt_cnt; }; -struct e4_tstorm_iscsi_task_ag_ctx { +struct tstorm_iscsi_task_ag_ctx { u8 byte0; u8 byte1; __le16 word0; u8 flags0; -#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF -#define E4_TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT2_SHIFT 6 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT 7 +#define TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_MASK 0xF +#define TSTORM_ISCSI_TASK_AG_CTX_NIBBLE0_SHIFT 0 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT0_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT0_SHIFT 4 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT2_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT2_SHIFT 6 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT 7 u8 flags1; -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_BIT5_SHIFT 1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 2 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 4 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 6 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT4_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT4_SHIFT 0 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT5_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_BIT5_SHIFT 1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 2 +#define TSTORM_ISCSI_TASK_AG_CTX_CF1_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF1_SHIFT 4 +#define TSTORM_ISCSI_TASK_AG_CTX_CF2_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF2_SHIFT 6 u8 flags2; -#define 
E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 0 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4_SHIFT 2 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5_SHIFT 4 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6_SHIFT 6 +#define TSTORM_ISCSI_TASK_AG_CTX_CF3_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF3_SHIFT 0 +#define TSTORM_ISCSI_TASK_AG_CTX_CF4_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF4_SHIFT 2 +#define TSTORM_ISCSI_TASK_AG_CTX_CF5_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF5_SHIFT 4 +#define TSTORM_ISCSI_TASK_AG_CTX_CF6_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF6_SHIFT 6 u8 flags3; -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_MASK 0x3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7_SHIFT 0 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 2 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 4 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 5 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF4EN_SHIFT 6 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF5EN_SHIFT 7 +#define TSTORM_ISCSI_TASK_AG_CTX_CF7_MASK 0x3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF7_SHIFT 0 +#define TSTORM_ISCSI_TASK_AG_CTX_CF0EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF0EN_SHIFT 2 +#define TSTORM_ISCSI_TASK_AG_CTX_CF1EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF1EN_SHIFT 3 +#define TSTORM_ISCSI_TASK_AG_CTX_CF2EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF2EN_SHIFT 4 +#define TSTORM_ISCSI_TASK_AG_CTX_CF3EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF3EN_SHIFT 5 +#define TSTORM_ISCSI_TASK_AG_CTX_CF4EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF4EN_SHIFT 6 +#define TSTORM_ISCSI_TASK_AG_CTX_CF5EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF5EN_SHIFT 7 u8 flags4; -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF6EN_SHIFT 0 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_CF7EN_SHIFT 1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 2 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 3 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 4 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 5 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 6 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 -#define E4_TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 7 +#define TSTORM_ISCSI_TASK_AG_CTX_CF6EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF6EN_SHIFT 0 +#define TSTORM_ISCSI_TASK_AG_CTX_CF7EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_CF7EN_SHIFT 1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE0EN_SHIFT 2 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE1EN_SHIFT 3 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE2EN_SHIFT 4 +#define 
TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE3EN_SHIFT 5 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE4EN_SHIFT 6 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_MASK 0x1 +#define TSTORM_ISCSI_TASK_AG_CTX_RULE5EN_SHIFT 7 u8 byte2; __le16 word1; __le32 reg0; diff --git a/include/linux/qed/nvmetcp_common.h b/include/linux/qed/nvmetcp_common.h index 5a2ab0606308..cc7c7481a0e0 100644 --- a/include/linux/qed/nvmetcp_common.h +++ b/include/linux/qed/nvmetcp_common.h @@ -410,7 +410,7 @@ struct e5_ystorm_nvmetcp_task_ag_ctx { u8 byte2; u8 byte3; u8 byte4; - u8 e4_reserved7; + u8 reserved7; }; struct e5_mstorm_nvmetcp_task_ag_ctx { @@ -445,7 +445,7 @@ struct e5_mstorm_nvmetcp_task_ag_ctx { u8 byte2; u8 byte3; u8 byte4; - u8 e4_reserved7; + u8 reserved7; }; struct e5_ustorm_nvmetcp_task_ag_ctx { @@ -489,17 +489,17 @@ struct e5_ustorm_nvmetcp_task_ag_ctx { #define E5_USTORM_NVMETCP_TASK_AG_CTX_CMP_CONT_RCV_EXP_EN_SHIFT 7 u8 flags3; u8 flags4; -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_MASK 0x3 -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED5_SHIFT 0 -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_MASK 0x1 -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED6_SHIFT 2 -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_MASK 0x1 -#define E5_USTORM_NVMETCP_TASK_AG_CTX_E4_RESERVED7_SHIFT 3 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED5_MASK 0x3 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED5_SHIFT 0 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED6_MASK 0x1 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED6_SHIFT 2 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED7_MASK 0x1 +#define E5_USTORM_NVMETCP_TASK_AG_CTX_RESERVED7_SHIFT 3 #define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_MASK 0xF #define E5_USTORM_NVMETCP_TASK_AG_CTX_DIF_ERROR_TYPE_SHIFT 4 u8 byte2; u8 byte3; - u8 e4_reserved8; + u8 reserved8; __le32 dif_err_intervals; __le32 dif_error_1st_interval; __le32 rcv_cont_len; diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index f39451aaaeec..4dcd0d37a521 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -588,7 +588,7 @@ enum qed_int_mode { }; struct qed_sb_info { - struct status_block_e4 *sb_virt; + struct status_block *sb_virt; dma_addr_t sb_phys; u32 sb_ack; /* Last given ack */ u16 igu_sb_id; @@ -613,7 +613,6 @@ enum qed_hw_err_type { enum qed_dev_type { QED_DEV_TYPE_BB, QED_DEV_TYPE_AH, - QED_DEV_TYPE_E5, }; struct qed_dev_info { @@ -1411,7 +1410,7 @@ static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info) u16 rc = 0; prod = le32_to_cpu(sb_info->sb_virt->prod_index) & - STATUS_BLOCK_E4_PROD_INDEX_MASK; + STATUS_BLOCK_PROD_INDEX_MASK; if (sb_info->sb_ack != prod) { sb_info->sb_ack = prod; rc |= QED_SB_IDX; -- cgit v1.2.3 From 484563e230a8c68ab342080b41b5a5e2ce9621ef Mon Sep 17 00:00:00 2001 From: Prabhakar Kushwaha Date: Mon, 4 Oct 2021 09:58:42 +0300 Subject: qed: Update common_hsi for FW ver 8.59.1.0 The common_hsi.h has been updated for FW version 8.59.1.0 with the below changes. - FW and Tools version. - New structures related to search table, packet duplication. - Structure for doorbell address for legacy mode without DEM. - Enhanced union rdma_eqe_data for RoCE Suspend Event Data. - New defines. This patch also fixes the existing checkpatch warnings and a few important checks. Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: Omkar Kulkarni Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S.
Miller --- include/linux/qed/common_hsi.h | 113 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 105 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 3742d1f7d1f7..827624840ee2 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ /* QLogic qed NIC Driver * Copyright (c) 2015-2016 QLogic Corporation - * Copyright (c) 2019-2020 Marvell International Ltd. + * Copyright (c) 2019-2021 Marvell International Ltd. */ #ifndef _COMMON_HSI_H @@ -47,10 +47,10 @@ #define ISCSI_CDU_TASK_SEG_TYPE 0 #define FCOE_CDU_TASK_SEG_TYPE 0 #define RDMA_CDU_TASK_SEG_TYPE 1 +#define ETH_CDU_TASK_SEG_TYPE 2 #define FW_ASSERT_GENERAL_ATTN_IDX 32 - /* Queue Zone sizes in bytes */ #define TSTORM_QZONE_SIZE 8 #define MSTORM_QZONE_SIZE 16 @@ -60,9 +60,12 @@ #define PSTORM_QZONE_SIZE 0 #define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG 7 -#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT 16 -#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE 48 -#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD 112 +#define ETH_MAX_RXQ_VF_DEFAULT 16 +#define ETH_MAX_RXQ_VF_DOUBLE 48 +#define ETH_MAX_RXQ_VF_QUAD 112 + +#define ETH_RGSRC_CTX_SIZE 6 +#define ETH_TGSRC_CTX_SIZE 6 /********************************/ /* CORE (LIGHT L2) FW CONSTANTS */ @@ -89,8 +92,8 @@ #define MAX_NUM_LL2_TX_STATS_COUNTERS 48 #define FW_MAJOR_VERSION 8 -#define FW_MINOR_VERSION 42 -#define FW_REVISION_VERSION 2 +#define FW_MINOR_VERSION 59 +#define FW_REVISION_VERSION 1 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -112,6 +115,7 @@ #define MAX_NUM_VFS (MAX_NUM_VFS_K2) #define MAX_NUM_FUNCTIONS_BB (MAX_NUM_PFS_BB + MAX_NUM_VFS_BB) +#define MAX_NUM_FUNCTIONS_K2 (MAX_NUM_PFS_K2 + MAX_NUM_VFS_K2) #define MAX_FUNCTION_NUMBER_BB (MAX_NUM_PFS + MAX_NUM_VFS_BB) #define MAX_FUNCTION_NUMBER_K2 (MAX_NUM_PFS + MAX_NUM_VFS_K2) @@ -144,7 +148,7 @@ #define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) /* Tools Version */ -#define TOOLS_VERSION 10 +#define TOOLS_VERSION 11 /*****************/ /* CDU CONSTANTS */ @@ -162,6 +166,7 @@ #define CDU_CONTEXT_VALIDATION_CFG_USE_REGION (3) #define CDU_CONTEXT_VALIDATION_CFG_USE_CID (4) #define CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE (5) +#define CDU_CONTEXT_VALIDATION_DEFAULT_CFG (0x3d) /*****************/ /* DQ CONSTANTS */ @@ -302,6 +307,9 @@ /* PWM address mapping */ #define DQ_PWM_OFFSET_DPM_BASE 0x0 #define DQ_PWM_OFFSET_DPM_END 0x27 +#define DQ_PWM_OFFSET_XCM32_24ICID_BASE 0x28 +#define DQ_PWM_OFFSET_UCM32_24ICID_BASE 0x30 +#define DQ_PWM_OFFSET_TCM32_24ICID_BASE 0x38 #define DQ_PWM_OFFSET_XCM16_BASE 0x40 #define DQ_PWM_OFFSET_XCM32_BASE 0x44 #define DQ_PWM_OFFSET_UCM16_BASE 0x48 @@ -325,6 +333,13 @@ #define DQ_PWM_OFFSET_TCM_LL2_PROD_UPDATE \ (DQ_PWM_OFFSET_TCM32_BASE + DQ_TCM_AGG_VAL_SEL_REG9 - 4) +#define DQ_PWM_OFFSET_XCM_RDMA_24B_ICID_SQ_PROD \ + (DQ_PWM_OFFSET_XCM32_24ICID_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_24B_ICID_CQ_CONS_32BIT \ + (DQ_PWM_OFFSET_UCM32_24ICID_BASE + 4) +#define DQ_PWM_OFFSET_TCM_ROCE_24B_ICID_RQ_PROD \ + (DQ_PWM_OFFSET_TCM32_24ICID_BASE + 1) + #define DQ_REGION_SHIFT (12) /* DPM */ @@ -360,6 +375,7 @@ /* Number of global Vport/QCN rate limiters */ #define MAX_QM_GLOBAL_RLS 256 +#define COMMON_MAX_QM_GLOBAL_RLS MAX_QM_GLOBAL_RLS /* QM registers data */ #define QM_LINE_CRD_REG_WIDTH 16 @@ -700,6 +716,13 @@ enum mf_mode { MAX_MF_MODE }; +/* Per protocol packet duplication enable bit vector. 
If set, duplicate + * offloaded traffic to LL2 debug queue. + */ +struct offload_pkt_dup_enable { + __le16 enable_vector; +}; + /* Per-protocol connection types */ enum protocol_type { PROTOCOLID_TCP_ULP, @@ -717,6 +740,12 @@ enum protocol_type { MAX_PROTOCOL_TYPE }; +/* Pstorm packet duplication config */ +struct pstorm_pkt_dup_cfg { + struct offload_pkt_dup_enable enable; + __le16 reserved[3]; +}; + struct regpair { __le32 lo; __le32 hi; @@ -728,10 +757,24 @@ struct rdma_eqe_destroy_qp { u8 reserved[4]; }; +/* RoCE Suspend Event Data */ +struct rdma_eqe_suspend_qp { + __le32 cid; + u8 reserved[4]; +}; + /* RDMA Event Data Union */ union rdma_eqe_data { struct regpair async_handle; struct rdma_eqe_destroy_qp rdma_destroy_qp_data; + struct rdma_eqe_suspend_qp rdma_suspend_qp_data; +}; + +/* Tstorm packet duplication config */ +struct tstorm_pkt_dup_cfg { + struct offload_pkt_dup_enable enable; + __le16 reserved; + __le32 cid; }; struct tstorm_queue_zone { @@ -891,6 +934,15 @@ struct db_legacy_addr { #define DB_LEGACY_ADDR_ICID_SHIFT 5 }; +/* Structure for doorbell address, in legacy mode, without DEMS */ +struct db_legacy_wo_dems_addr { + __le32 addr; +#define DB_LEGACY_WO_DEMS_ADDR_RESERVED0_MASK 0x3 +#define DB_LEGACY_WO_DEMS_ADDR_RESERVED0_SHIFT 0 +#define DB_LEGACY_WO_DEMS_ADDR_ICID_MASK 0x3FFFFFFF +#define DB_LEGACY_WO_DEMS_ADDR_ICID_SHIFT 2 +}; + /* Structure for doorbell address, in PWM mode */ struct db_pwm_addr { __le32 addr; @@ -906,6 +958,31 @@ struct db_pwm_addr { #define DB_PWM_ADDR_RESERVED1_SHIFT 28 }; +/* Parameters to RDMA firmware, passed in EDPM doorbell */ +struct db_rdma_24b_icid_dpm_params { + __le32 params; +#define DB_RDMA_24B_ICID_DPM_PARAMS_SIZE_MASK 0x3F +#define DB_RDMA_24B_ICID_DPM_PARAMS_SIZE_SHIFT 0 +#define DB_RDMA_24B_ICID_DPM_PARAMS_DPM_TYPE_MASK 0x3 +#define DB_RDMA_24B_ICID_DPM_PARAMS_DPM_TYPE_SHIFT 6 +#define DB_RDMA_24B_ICID_DPM_PARAMS_OPCODE_MASK 0xFF +#define DB_RDMA_24B_ICID_DPM_PARAMS_OPCODE_SHIFT 8 +#define DB_RDMA_24B_ICID_DPM_PARAMS_ICID_EXT_MASK 0xFF +#define DB_RDMA_24B_ICID_DPM_PARAMS_ICID_EXT_SHIFT 16 +#define DB_RDMA_24B_ICID_DPM_PARAMS_INV_BYTE_CNT_MASK 0x7 +#define DB_RDMA_24B_ICID_DPM_PARAMS_INV_BYTE_CNT_SHIFT 24 +#define DB_RDMA_24B_ICID_DPM_PARAMS_EXT_ICID_MODE_EN_MASK 0x1 +#define DB_RDMA_24B_ICID_DPM_PARAMS_EXT_ICID_MODE_EN_SHIFT 27 +#define DB_RDMA_24B_ICID_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 +#define DB_RDMA_24B_ICID_DPM_PARAMS_COMPLETION_FLG_SHIFT 28 +#define DB_RDMA_24B_ICID_DPM_PARAMS_S_FLG_MASK 0x1 +#define DB_RDMA_24B_ICID_DPM_PARAMS_S_FLG_SHIFT 29 +#define DB_RDMA_24B_ICID_DPM_PARAMS_RESERVED1_MASK 0x1 +#define DB_RDMA_24B_ICID_DPM_PARAMS_RESERVED1_SHIFT 30 +#define DB_RDMA_24B_ICID_DPM_PARAMS_CONN_TYPE_IS_IWARP_MASK 0x1 +#define DB_RDMA_24B_ICID_DPM_PARAMS_CONN_TYPE_IS_IWARP_SHIFT 31 +}; + /* Parameters to RDMA firmware, passed in EDPM doorbell */ struct db_rdma_dpm_params { __le32 params; @@ -1220,6 +1297,26 @@ struct rdif_task_context { __le32 reserved2; }; +/* Searcher Table struct */ +struct src_entry_header { + __le32 flags; +#define SRC_ENTRY_HEADER_NEXT_PTR_TYPE_MASK 0x1 +#define SRC_ENTRY_HEADER_NEXT_PTR_TYPE_SHIFT 0 +#define SRC_ENTRY_HEADER_EMPTY_MASK 0x1 +#define SRC_ENTRY_HEADER_EMPTY_SHIFT 1 +#define SRC_ENTRY_HEADER_RESERVED_MASK 0x3FFFFFFF +#define SRC_ENTRY_HEADER_RESERVED_SHIFT 2 + __le32 magic_number; + struct regpair next_ptr; +}; + +/* Enumeration for address type */ enum src_header_next_ptr_type_enum { e_physical_addr, e_logical_addr, MAX_SRC_HEADER_NEXT_PTR_TYPE_ENUM +}; + /* Status block
structure */ struct status_block { __le16 pi_array[PIS_PER_SB]; -- cgit v1.2.3 From fe40a830dcded26f012739fd6dac0da9c805bc38 Mon Sep 17 00:00:00 2001 From: Prabhakar Kushwaha Date: Mon, 4 Oct 2021 09:58:44 +0300 Subject: qed: Update qed_hsi.h for fw 8.59.1.0 The qed_hsi.h has been updated to support the new FW version 8.59.1.0 with the following changes. - Updates FW HSI (Hardware Software interface) structures. - Addition/update in function declaration and defines as per HSI. - Add generic infrastructure for FW error reporting as part of common event queue handling. - Move malicious VF error reporting to FW error reporting infrastructure. - Move consolidation queue initialization from FW context to ramrod message. The qed_hsi.h header file changes lead to changes in many files to ensure compilation. This patch also fixes the existing checkpatch warnings and a few important checks. Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: Omkar Kulkarni Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S. Miller --- include/linux/qed/eth_common.h | 1 + include/linux/qed/rdma_common.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index cd1207ad4ada..c84e08bc6802 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -67,6 +67,7 @@ /* Ethernet vport update constants */ #define ETH_FILTER_RULES_COUNT 10 #define ETH_RSS_IND_TABLE_ENTRIES_NUM 128 +#define ETH_RSS_IND_TABLE_MASK_SIZE_REGS (ETH_RSS_IND_TABLE_ENTRIES_NUM / 32) #define ETH_RSS_KEY_SIZE_REGS 10 #define ETH_RSS_ENGINE_NUM_K2 207 #define ETH_RSS_ENGINE_NUM_BB 127 diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h index bab078b25834..6dfed163ab6c 100644 --- a/include/linux/qed/rdma_common.h +++ b/include/linux/qed/rdma_common.h @@ -27,6 +27,7 @@ #define RDMA_MAX_PDS (64 * 1024) #define RDMA_MAX_XRC_SRQS (1024) #define RDMA_MAX_SRQS (32 * 1024) +#define RDMA_MAX_IRQ_ELEMS_IN_PAGE (128) #define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS #define RDMA_NUM_STATISTIC_COUNTERS_K2 MAX_NUM_VPORTS_K2 -- cgit v1.2.3 From 3a6f5d0cbda37f24f60ca778bd1675125b7d594f Mon Sep 17 00:00:00 2001 From: Nikolay Assa Date: Mon, 4 Oct 2021 09:58:49 +0300 Subject: qed: Update TCP silly-window-syndrome timeout for iwarp, scsi Update the TCP silly-window-syndrome timeout for the cases where the initiator's small TCP window size prevents the FW from transmitting packets on the connection. The timeout causes the FW to retransmit window probes if needed, preventing an I/O stall if the initiator ignores the first window probe. Reviewed-by: Manish Rangankar Signed-off-by: Nikolay Assa Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: Omkar Kulkarni Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 4dcd0d37a521..07f8d19421ab 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -24,6 +24,8 @@ #include #include +#define QED_TX_SWS_TIMER_DFLT 500 + enum dcbx_protocol_type { DCBX_PROTOCOL_ISCSI, DCBX_PROTOCOL_FCOE, -- cgit v1.2.3 From a64aa0a8b991e3cae61ee8322cf936c00068a5c1 Mon Sep 17 00:00:00 2001 From: Prabhakar Kushwaha Date: Mon, 4 Oct 2021 09:58:50 +0300 Subject: qed: Update the TCP active termination 2 MSL timer ("TIME_WAIT") Initialize the 2 MSL timeout value used for the TCP TIME_WAIT state to a non-zero default.
This patch also removes a magic number from qedi/qedi_main.c. Reviewed-by: Manish Rangankar Signed-off-by: Nikolay Assa Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: Omkar Kulkarni Signed-off-by: Prabhakar Kushwaha Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 07f8d19421ab..ad220d5da18f 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -25,6 +25,7 @@ #include #define QED_TX_SWS_TIMER_DFLT 500 +#define QED_TWO_MSL_TIMER_DFLT 4000 enum dcbx_protocol_type { DCBX_PROTOCOL_ISCSI, -- cgit v1.2.3 From a2c27a61b4335344c1bacaade61e9b2dc6e12a76 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 4 Oct 2021 12:03:28 +0100 Subject: net: phylink: add phylink_set_10g_modes() helper Add a helper for setting 10Gigabit modes, so we have one central place that sets all appropriate 10G modes for a driver. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 237291196ce2..f7b5ed06a815 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -484,6 +484,7 @@ int phylink_speed_up(struct phylink *pl); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); +void phylink_set_10g_modes(unsigned long *mask); void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit v1.2.3 From 571e5c0efcb29c5dac8cf2949d3eed84ec43056c Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 19 May 2021 16:00:22 -0400 Subject: audit: add OPENAT2 record to list "how" info Since the openat2(2) syscall uses a struct open_how pointer to communicate its parameters, they are not usefully recorded by the audit SYSCALL record's four existing arguments. Add a new audit record type OPENAT2 that reports the parameters in its third argument, struct open_how with fields oflag, mode and resolve.
The new record in the context of an event would look like: time->Wed Mar 17 16:28:53 2021 type=PROCTITLE msg=audit(1616012933.531:184): proctitle= 73797363616C6C735F66696C652F6F70656E617432002F746D702F61756469742D 7465737473756974652D737641440066696C652D6F70656E617432 type=PATH msg=audit(1616012933.531:184): item=1 name="file-openat2" inode=29 dev=00:1f mode=0100600 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=0 cap_fi=0 cap_fe=0 cap_fver=0 cap_frootid=0 type=PATH msg=audit(1616012933.531:184): item=0 name="/root/rgb/git/audit-testsuite/tests" inode=25 dev=00:1f mode=040700 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=PARENT cap_fp=0 cap_fi=0 cap_fe=0 cap_fver=0 cap_frootid=0 type=CWD msg=audit(1616012933.531:184): cwd="/root/rgb/git/audit-testsuite/tests" type=OPENAT2 msg=audit(1616012933.531:184): oflag=0100302 mode=0600 resolve=0xa type=SYSCALL msg=audit(1616012933.531:184): arch=c000003e syscall=437 success=yes exit=4 a0=3 a1=7ffe315f1c53 a2=7ffe315f1550 a3=18 items=2 ppid=528 pid=540 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 ses=1 comm="openat2" exe="/root/rgb/git/audit-testsuite/tests/syscalls_file/openat2" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key="testsuite-1616012933-bjAUcEPO" Link: https://lore.kernel.org/r/d23fbb89186754487850367224b060e26f9b7181.1621363275.git.rgb@redhat.com Signed-off-by: Richard Guy Briggs Acked-by: Christian Brauner [PM: tweak subject, wrap example, move AUDIT_OPENAT2 to 1337] Signed-off-by: Paul Moore --- include/linux/audit.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 5fbeeeb6b726..0e0eb4c0fa10 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -399,6 +399,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); +extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(char *name); extern void __audit_fanotify(unsigned int response); extern void __audit_tk_injoffset(struct timespec64 offset); @@ -495,6 +496,12 @@ static inline void audit_mmap_fd(int fd, int flags) __audit_mmap_fd(fd, flags); } +static inline void audit_openat2_how(struct open_how *how) +{ + if (unlikely(!audit_dummy_context())) + __audit_openat2_how(how); +} + static inline void audit_log_kern_module(char *name) { if (!audit_dummy_context()) @@ -646,6 +653,9 @@ static inline void audit_log_capset(const struct cred *new, static inline void audit_mmap_fd(int fd, int flags) { } +static inline void audit_openat2_how(struct open_how *how) +{ } + static inline void audit_log_kern_module(char *name) { } -- cgit v1.2.3 From 5249001d69a223811ad654884c6115d55158fd51 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 1 Sep 2021 19:05:02 +0300 Subject: net/mlx5: Bridge, mark reg_c1 when pushing VLAN On ingress VLAN push also assign value 0x7FE to reg_c1 tunnel id+opts bits (tunnel id 0, which is not a valid tunnel id, and option 0x7FE which was reserved by one of the previous patches in the series). In a following patch the reg value is matched on egress miss to restore the packet to its original state by removing the VLAN before passing it to the software data path.
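For illustration, the reserved mark decomposes as follows; this is a sketch built from the ESW_TUN_* defines added in the diff below, not code from the patch:

	/* tunnel id 0 in the upper bits, option value 0x7FE in the lower
	 * ESW_TUN_OPTS_BITS bits; together they form the reg_c1 mark
	 */
	u32 mark = (ESW_TUN_ID_BRIDGE_INGRESS_PUSH_VLAN << ESW_TUN_OPTS_BITS) |
		   ESW_TUN_OPTS_BRIDGE_INGRESS_PUSH_VLAN;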
Signed-off-by: Vlad Buslov Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 4ab5c1fc1270..97afcea39a7b 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -130,11 +130,20 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, #define ESW_TUN_OPTS_MASK GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, ESW_TUN_OPTS_OFFSET) #define ESW_TUN_MASK GENMASK(31 - ESW_RESERVED_BITS, ESW_TUN_OFFSET) #define ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT 0 /* 0 is not a valid tunnel id */ +#define ESW_TUN_ID_BRIDGE_INGRESS_PUSH_VLAN ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT /* 0x7FF is a reserved mapping */ #define ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT GENMASK(ESW_TUN_OPTS_BITS - 1, 0) #define ESW_TUN_SLOW_TABLE_GOTO_VPORT ((ESW_TUN_ID_SLOW_TABLE_GOTO_VPORT << ESW_TUN_OPTS_BITS) | \ ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT) #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK +/* 0x7FE is a reserved mapping for bridge ingress push vlan mark */ +#define ESW_TUN_OPTS_BRIDGE_INGRESS_PUSH_VLAN (ESW_TUN_OPTS_SLOW_TABLE_GOTO_VPORT - 1) +#define ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN ((ESW_TUN_ID_BRIDGE_INGRESS_PUSH_VLAN << \ + ESW_TUN_OPTS_BITS) | \ + ESW_TUN_OPTS_BRIDGE_INGRESS_PUSH_VLAN) +#define ESW_TUN_BRIDGE_INGRESS_PUSH_VLAN_MARK \ + GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \ + ESW_TUN_OPTS_OFFSET + 1) u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); -- cgit v1.2.3 From 3663ad34bc707fc85492f4d83a313f5df84718d4 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 19 Aug 2021 16:18:57 +0300 Subject: net/mlx5: Shift control IRQ to the last index The control IRQ is the first IRQ vector. This complicates the handling of completion IRQs, as we need to offset them by one. In the next patch there are scenarios where completion and control EQs will share the same IRQ, for example functions with a single IRQ. To ease such scenarios, we shift the control IRQ to the end of the IRQ array. Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23417424373..0ca719c00824 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -59,6 +59,8 @@ #define MLX5_ADEV_NAME "mlx5_core" +#define MLX5_IRQ_EQ_CTRL (U8_MAX) + enum { MLX5_BOARD_ID_LEN = 64, }; -- cgit v1.2.3 From f891b7cdbdcda116fd26bbd706f91bd58567aa17 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 1 Aug 2021 12:08:49 +0300 Subject: net/mlx5: Enable single IRQ for PCI Function Prior to this patch the driver requires two IRQs to function properly, one required IRQ for control and at least one required IRQ for IO. This requirement can be relaxed to one as the driver now allows sharing of IRQs, so control and IO EQs can share the same IRQ. This is needed to support a high scale of VFs.
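The vector selection this enables can be sketched as follows; ctrl_irq_index() is a hypothetical helper for illustration, not a function from the driver:

	/* With more than one vector, the control EQ takes the dedicated
	 * last slot (MLX5_IRQ_EQ_CTRL, added by the previous patch, maps
	 * to it). With a single vector, control and completion EQs share
	 * vector 0.
	 */
	static int ctrl_irq_index(int num_vectors)
	{
		return num_vectors > 1 ? num_vectors - 1 : 0;
	}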
Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index cea6ecb4b73e..ea3ff5a8ced3 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -4,7 +4,6 @@ #ifndef MLX5_CORE_EQ_H #define MLX5_CORE_EQ_H -#define MLX5_IRQ_VEC_COMP_BASE 1 #define MLX5_NUM_CMD_EQE (32) #define MLX5_NUM_ASYNC_EQE (0x1000) #define MLX5_NUM_SPARE_EQE (0x80) -- cgit v1.2.3 From e68ce0faf29c7c268666e11e95bf27dca97d28b0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 2 Sep 2021 14:47:48 +0200 Subject: mfd: hi6421-spmi-pmic: Cleanup drvdata to only include regmap There are lots of fields in struct hi6421_spmi_pmic that aren't used. As a matter of fact, only regmap is needed. So, drop the struct as a whole, and set regmap as the drvdata. Signed-off-by: Mauro Carvalho Chehab Acked-by: Mark Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/1828cb783b1ebca0b98bf0b3077d8701adb228f7.1630586862.git.mchehab+huawei@kernel.org --- include/linux/mfd/hi6421-spmi-pmic.h | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/mfd/hi6421-spmi-pmic.h (limited to 'include/linux') diff --git a/include/linux/mfd/hi6421-spmi-pmic.h b/include/linux/mfd/hi6421-spmi-pmic.h deleted file mode 100644 index e5b8dbf828b6..000000000000 --- a/include/linux/mfd/hi6421-spmi-pmic.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for device driver Hi6421 PMIC - * - * Copyright (c) 2013 Linaro Ltd. - * Copyright (C) 2011 Hisilicon. - * Copyright (c) 2020-2021 Huawei Technologies Co., Ltd - * - * Guodong Xu - */ - -#ifndef __HISI_PMIC_H -#define __HISI_PMIC_H - -#include -#include - -struct hi6421_spmi_pmic { - struct resource *res; - struct device *dev; - void __iomem *regs; - struct regmap *regmap; -}; - -#endif /* __HISI_PMIC_H */ -- cgit v1.2.3 From c1baf6c591e6901d3422d7a0d0d32ccf29883edf Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sun, 12 Sep 2021 21:17:15 +0300 Subject: usb: phy: tegra: Support OTG mode programming Support programming USB PHY into OTG mode. 
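For illustration, a per-SoC configuration could opt in to the new behavior roughly like this; the initializer name and values are hypothetical, using only fields visible in the diff below:

	static const struct tegra_phy_soc_config example_soc_config = {
		.has_hostpc = true,
		.requires_pmc_ao_power_up = true, /* USB AO powered down by default */
	};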
Signed-off-by: Dmitry Osipenko Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20210912181718.1328-5-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tegra_usb_phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h index fd1c9f6a4e37..d3e65eb9e16f 100644 --- a/include/linux/usb/tegra_usb_phy.h +++ b/include/linux/usb/tegra_usb_phy.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -30,6 +31,7 @@ * enter host mode * requires_extra_tuning_parameters: true if xcvr_hsslew, hssquelch_level * and hsdiscon_level should be set for adequate signal quality + * requires_pmc_ao_power_up: true if USB AO is powered down by default */ struct tegra_phy_soc_config { @@ -37,6 +39,7 @@ struct tegra_phy_soc_config { bool has_hostpc; bool requires_usbmode_setup; bool requires_extra_tuning_parameters; + bool requires_pmc_ao_power_up; }; struct tegra_utmip_config { @@ -62,6 +65,7 @@ enum tegra_usb_phy_port_speed { struct tegra_xtal_freq; struct tegra_usb_phy { + int irq; int instance; const struct tegra_xtal_freq *freq; void __iomem *regs; @@ -70,6 +74,7 @@ struct tegra_usb_phy { struct clk *pll_u; struct clk *pad_clk; struct regulator *vbus; + struct regmap *pmc_regmap; enum usb_dr_mode mode; void *config; const struct tegra_phy_soc_config *soc_config; -- cgit v1.2.3 From 3f6cffb8604b537e3d7ea040d7f4368689638eaf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Oct 2021 16:01:40 -0700 Subject: etherdevice: use __dev_addr_set() Andrew points out that eth_hw_addr_set() replaces memcpy() calls, so we can't use ether_addr_copy(), which assumes both arguments are 2-byte aligned. Reported-by: Andrew Lunn Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 928c411bd509..c58d50451485 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -308,7 +308,7 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) */ static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) { - ether_addr_copy(dev->dev_addr, addr); + __dev_addr_set(dev, addr, ETH_ALEN); } /** -- cgit v1.2.3 From 1e4071f6282b3323435b02b1719bcfbfe1b57150 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 24 Sep 2021 07:12:42 -0500 Subject: ipmi: Export ipmb_checksum() It will be needed by the upcoming IPMB direct addressing. Signed-off-by: Corey Minyard Tested-by: Andrew Manley Reviewed-by: Andrew Manley --- include/linux/ipmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index 52850a02a3d0..163831a087ef 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -335,4 +335,7 @@ extern int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data); #define GET_DEVICE_ID_MAX_RETRY 5 +/* Helper function for computing the IPMB checksum of some data.
*/ +unsigned char ipmb_checksum(unsigned char *data, int size); + #endif /* __LINUX_IPMI_H */ -- cgit v1.2.3 From 059747c245f0e9af5e109eece7d3414dbe08d513 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Fri, 24 Sep 2021 11:42:56 -0500 Subject: ipmi: Add support for IPMB direct messages An application has come up that has a device sitting right on the IPMB that would like to communicate with the BMC on the IPMB using normal IPMI commands. Sending these commands and handling the responses is easy enough; no modifications are needed to the IPMI infrastructure. But if this is an application that also needs to receive IPMB commands and respond, some way is needed to handle these incoming commands and send the responses. Currently, the IPMI message handler only sends commands to the interface and only receives responses from the interface. This change extends the interface to receive commands/responses and send commands/responses. These are formatted differently in support of receiving/sending IPMB messages directly. Signed-off-by: Corey Minyard Tested-by: Andrew Manley Reviewed-by: Andrew Manley --- include/linux/ipmi_smi.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index deec18b8944a..9277d21c2690 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -38,6 +38,59 @@ struct ipmi_smi; #define IPMI_WATCH_MASK_CHECK_WATCHDOG (1 << 1) #define IPMI_WATCH_MASK_CHECK_COMMANDS (1 << 2) +/* + * SMI messages + * + * When communicating with an SMI, messages come in two formats: + * + * * Normal (to a BMC over a BMC interface) + * + * * IPMB (over an IPMB to another MC) + * + * When normal, commands are sent using the format defined by a + * standard message over KCS (NetFn must be even): + * + * +-----------+-----+------+ + * | NetFn/LUN | Cmd | Data | + * +-----------+-----+------+ + * + * And responses, similarly, with a completion code added (NetFn must + * be odd): + * + * +-----------+-----+------+------+ + * | NetFn/LUN | Cmd | CC | Data | + * +-----------+-----+------+------+ + * + * With normal messages, only commands are sent and only responses are + * received. + * + * In IPMB mode, we are acting as an IPMB device. Commands will be in + * the following format (NetFn must be even): + * + * +-------------+------+-------------+-----+------+ + * | NetFn/rsLUN | Addr | rqSeq/rqLUN | Cmd | Data | + * +-------------+------+-------------+-----+------+ + * + * Responses will use the following format: + * + * +-------------+------+-------------+-----+------+------+ + * | NetFn/rqLUN | Addr | rqSeq/rsLUN | Cmd | CC | Data | + * +-------------+------+-------------+-----+------+------+ + * + * This is similar to the format defined in the IPMB manual section + * 2.11.1 with the checksums and the first address removed. Also, the + * address is always the remote address. + * + * IPMB messages can be commands and responses in both directions. + * Received commands are handled as received commands from the message + * queue. + */ + +enum ipmi_smi_msg_type { + IPMI_SMI_MSG_TYPE_NORMAL = 0, + IPMI_SMI_MSG_TYPE_IPMB_DIRECT +}; + /* * Messages to/from the lower layer. The smi interface will take one * of these to send.
After the send has occurred and a response has @@ -54,6 +107,8 @@ struct ipmi_smi; struct ipmi_smi_msg { struct list_head link; + enum ipmi_smi_msg_type type; + long msgid; void *user_data; @@ -73,6 +128,10 @@ struct ipmi_smi_msg { struct ipmi_smi_handlers { struct module *owner; + /* Capabilities of the SMI. */ +#define IPMI_SMI_CAN_HANDLE_IPMB_DIRECT (1 << 0) + unsigned int flags; + /* * The low-level interface cannot start sending messages to * the upper layer until this function is called. This may -- cgit v1.2.3 From 549017aa1bb7ec19a1e24e7f65480a1c2e76b90e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Oct 2021 13:52:42 +0200 Subject: netlink: remove netlink_broadcast_filtered No users in tree since commit a3498436b3a0 ("netns: restrict uevents"), so remove this functionality. Cc: Christian Brauner Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netlink.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 61b1c7fcc401..1ec631838af9 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -156,10 +156,6 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); -int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, - __u32 portid, __u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), - void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); -- cgit v1.2.3 From ded6e16b37e4c8c86cda98604ecd78818d6ca36a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Oct 2021 12:14:43 -0700 Subject: mlx4: replace mlx4_mac_to_u64() with ether_addr_to_u64() mlx4_mac_to_u64() predates and opencodes ether_addr_to_u64(). It doesn't make the argument constant so it'll be problematic when dev->dev_addr becomes a const. Convert to the generic helper. Signed-off-by: Jakub Kicinski Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index a858bcb6220b..b26b71f62fb4 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -92,18 +92,6 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port); -static inline u64 mlx4_mac_to_u64(u8 *addr) -{ - u64 mac = 0; - int i; - - for (i = 0; i < ETH_ALEN; i++) { - mac <<= 8; - mac |= addr[i]; - } - return mac; -} - static inline void mlx4_u64_to_mac(u8 *addr, u64 mac) { int i; -- cgit v1.2.3 From 1bb96a07f9a8f2fe9725f6689605e32b75d20508 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Oct 2021 12:14:44 -0700 Subject: mlx4: replace mlx4_u64_to_mac() with u64_to_ether_addr() mlx4_u64_to_mac() predates the common helper but doesn't make the argument constant. Signed-off-by: Jakub Kicinski Reviewed-by: Tariq Toukan Signed-off-by: David S. 
Miller --- include/linux/mlx4/driver.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index b26b71f62fb4..1834c8fad12e 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -92,14 +92,4 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port); -static inline void mlx4_u64_to_mac(u8 *addr, u64 mac) -{ - int i; - - for (i = ETH_ALEN; i > 0; i--) { - addr[i - 1] = mac & 0xFF; - mac >>= 8; - } -} - #endif /* MLX4_DRIVER_H */ -- cgit v1.2.3 From ebb1fdb589bd6d0ee647d4fa285bc934ba369cde Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Oct 2021 12:14:46 -0700 Subject: mlx4: constify args for const dev_addr netdev->dev_addr will become const soon. Make sure all functions which pass it around mark appropriate args as const. Signed-off-by: Jakub Kicinski Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 30bb59fe970c..6646634a0b9d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1436,7 +1436,7 @@ int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id); -int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr, +int mlx4_tunnel_steer_add(struct mlx4_dev *dev, const unsigned char *addr, int port, int qpn, u16 prio, u64 *reg_id); void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, -- cgit v1.2.3 From 027b57170bf8bb6999a28e4a5f3d78bf1db0f90c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 2 Oct 2021 15:09:00 +0200 Subject: serial: core: Fix initializing and restoring termios speed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit edc6afc54968 ("tty: switch to ktermios and new framework") the termios speed is no longer stored only in the c_cflag member but also in the new c_ispeed and c_ospeed members. If the BOTHER flag is set in c_cflag, the termios speed is stored only in these new members. Therefore, to correctly restore the termios speed it is required to also store the ispeed and ospeed members, not only the cflag member. If only the cflag member with the BOTHER flag is restored, the functions tty_termios_baud_rate() and tty_termios_input_baud_rate() return the baudrate stored in the c_ospeed / c_ispeed members, which is zero as it was not restored either. If the reported baudrate is invalid (e.g. zero), the serial core functions report the fallback baudrate value 9600. This means that in this case the original baudrate is lost and the kernel changes it to 9600. A simple reproducer of this issue is to boot the kernel with the following command line argument: "console=ttyXXX,86400" (where ttyXXX is the device name). For speed 86400 there is no Bnnn constant, therefore the kernel has to represent this speed via the BOTHER c_cflag, which means the speed is stored only in the c_ospeed and c_ispeed members, not in c_cflag anymore. If the bootloader correctly configures the serial device to speed 86400, the kernel prints the boot log to the early console at speed 86400 without any issue.
But after the kernel starts initializing the real console device ttyXXX, the speed is changed to the fallback value 9600 because the information about the speed was lost. This patch fixes the above issue by also storing and restoring the ispeed and ospeed members, which are required for the BOTHER flag. Fixes: edc6afc54968 ("[PATCH] tty: switch to ktermios and new framework") Cc: stable@vger.kernel.org Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20211002130900.9518-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 20874db50bc8..a97f277cfdfa 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -149,6 +149,8 @@ struct console { short flags; short index; int cflag; + uint ispeed; + uint ospeed; void *data; struct console *next; }; -- cgit v1.2.3 From 94ad8aacbc2d4908b052c8bdb5ae13bc702f77ea Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Sep 2021 16:40:50 +0200 Subject: ARM: omap1: move omap15xx local bus handling to usb.c Commit 38225f2ef2f4 ("ARM/omap1: switch to use dma_direct_set_offset for lbus DMA offsets") removed a lot of mach/memory.h, but left the USB offset handling split into arch/arm/mach-omap1/usb.c and drivers/usb/host/ohci-omap.c. This can cause a randconfig build warning that now fails the build with -Werror: arch/arm/mach-omap1/usb.c:561:30: error: 'omap_1510_usb_ohci_nb' defined but not used [-Werror=unused-variable] 561 | static struct notifier_block omap_1510_usb_ohci_nb = { | ^~~~~~~~~~~~~~~~~~~~~ Move it all into the platform file to get rid of the final location that relies on mach/memory.h. Acked-by: Felipe Balbi Acked-by: Alan Stern Acked-by: Tony Lindgren Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20210927144118.2464881-1-arnd@kernel.org Signed-off-by: Arnd Bergmann --- include/linux/platform_data/usb-omap1.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/usb-omap1.h b/include/linux/platform_data/usb-omap1.h index 43b5ce139c37..878e572a78bf 100644 --- a/include/linux/platform_data/usb-omap1.h +++ b/include/linux/platform_data/usb-omap1.h @@ -48,6 +48,8 @@ struct omap_usb_config { u32 (*usb2_init)(unsigned nwires, unsigned alt_pingroup); int (*ocpi_enable)(void); + + void (*lb_reset)(void); }; #endif /* __LINUX_USB_OMAP1_H */ -- cgit v1.2.3 From a130e8fbc7de796eb6e680724d87f4737a26d0ac Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 27 Aug 2021 09:54:38 -0700 Subject: fs/proc/uptime.c: Fix idle time reporting in /proc/uptime /proc/uptime reports idle time by reading the CPUTIME_IDLE field from the per-cpu kcpustats. However, on NO_HZ systems, idle time is not continually updated on idle cpus, causing this value to appear incorrectly small. /proc/stat performs an accounting update when reading idle time; we can use the same approach for uptime. With this patch, /proc/stat and /proc/uptime now agree on idle time.
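For illustration, the uptime show handler can sum per-cpu idle time through the helper declared in the diff below (a sketch of the caller; the fs/proc/uptime.c side is not part of this diff):

	u64 idle = 0;
	int i;

	/* perform the accounting update per cpu, as /proc/stat does */
	for_each_possible_cpu(i)
		idle += get_idle_time(&kcpustat_cpu(i), i);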
Additionally, the following shows idle time tick up consistently on an idle machine: (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}' Reported-by: Luigi Rizzo Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Link: https://lkml.kernel.org/r/20210827165438.3280779-1-joshdon@google.com --- include/linux/kernel_stat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44ae1a7eb9e3..69ae6b278464 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); +extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) -- cgit v1.2.3 From ceeadb83aea28372e54857bf88ab7e17af48ab7b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:41 +0000 Subject: sched: Make struct sched_statistics independent of fair sched class If we want to use the schedstats facility to trace other sched classes, we should make it independent of the fair sched class. The struct sched_statistics holds the scheduler statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. After the patch, schedstats are organized as follows, struct task_struct { ... struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; ... struct sched_statistics stats; ... }; Regarding the task group, schedstats is only supported for fair group sched, and a new struct sched_entity_stats is introduced, suggested by Peter - struct sched_entity_stats { struct sched_entity se; struct sched_statistics stats; } __no_randomize_layout; Then with the se in a task_group, we can easily get the stats. The sched_statistics members may be frequently modified when schedstats is enabled; in order to avoid impacting random data which may be in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. As this patch changes a core struct of the scheduler, I verified the performance impact with 'perf bench sched pipe', as suggested by Mel. Below is the result, in which all the values are in usecs/op. Before After kernel.sched_schedstats=0 5.2~5.4 5.2~5.4 kernel.sched_schedstats=1 5.3~5.5 5.3~5.5 [These data differ a little from the earlier version because my old test machine was destroyed, so I had to use a different test machine.] Almost no impact on the sched performance. No functional change.
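For illustration, a sketch of how the statistics are reached after the move (consistent with the structures above, not code from the patch):

	/* task p: statistics now live directly in task_struct */
	u64 wait_max = p->stats.wait_max;

	/* fair group entity se: recover the stats wrapped around the se */
	struct sched_entity_stats *ses =
		container_of(se, struct sched_entity_stats, se);
	u64 group_wait_max = ses->stats.wait_max;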
[lkp@intel.com: reported build failure in earlier version] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com --- include/linux/sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a927ddec64..2bc4c72fec2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -522,7 +522,7 @@ struct sched_statistics { u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif -}; +} ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ @@ -538,8 +538,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -803,6 +801,8 @@ struct task_struct { struct uclamp_se uclamp[UCLAMP_CNT]; #endif + struct sched_statistics stats; + #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; -- cgit v1.2.3 From 847fc0cd0664fcb2a08ac66df6b85935361ec454 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:43 +0000 Subject: sched: Introduce task block time in schedstats Currently in schedstats we have sum_sleep_runtime and iowait_sum, but there's no metric to show how long the task is in D state. Once a task is in D state, it means the task is blocked in the kernel, for example the task may be waiting for a mutex. The D state is more frequent than iowait, and it is more critical than the S state. So it is worth adding a metric to measure it. Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-5-laoar.shao@gmail.com --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bc4c72fec2d..193e16e2d0e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -503,6 +503,8 @@ struct sched_statistics { u64 block_start; u64 block_max; + s64 sum_block_runtime; + u64 exec_max; u64 slice_max; -- cgit v1.2.3 From 8d491de6edc27138806cae6e8eca455beb325b62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:32 +0200 Subject: sched: Move mmdrop to RCU on RT mmdrop() is invoked from finish_task_switch() by the incoming task to drop the mm which was handed over by the previous task. mmdrop() can be quite expensive, which prevents an incoming real-time task from getting useful work done. Provide mmdrop_sched() which maps to mmdrop() on !RT kernels. On RT kernels it delegates the eventually required invocation of __mmdrop() to RCU.
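A sketch of the intended call site (the kernel/sched side is not part of this diff; the surrounding context is illustrative):

	/* in finish_task_switch(), the incoming task drops the handed-over mm */
	if (mm)
		mmdrop_sched(mm); /* plain mmdrop() on !PREEMPT_RT */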
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.648582026@linutronix.de --- include/linux/mm_types.h | 4 ++++ include/linux/sched/mm.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..e9672de22cf2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -572,6 +573,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..aca874d33fe6 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,35 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +static inline void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From c8ed99533dbc0fcc1142671ec80acb33045d2999 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:23 +0100 Subject: PM: EM: Mark inefficient states Some SoCs, such as the sd855 have OPPs within the same performance domain, whose cost is higher than others with a higher frequency. Even though those OPPs are interesting from a cooling perspective, it makes no sense to use them when the device can run at full capacity. Those OPPs handicap the performance domain, when choosing the most energy-efficient CPU and are wasting energy. They are inefficient. Hence, add support for such OPPs to the Energy Model. The table can now be read skipping inefficient performance states (and by extension, inefficient OPPs). Signed-off-by: Vincent Donnefort Reviewed-by: Matthias Kaehlcke Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 39dcadd492b5..3641ca4acf04 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -17,13 +17,25 @@ * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency + * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { unsigned long frequency; unsigned long power; unsigned long cost; + unsigned long flags; }; +/* + * em_perf_state flags: + * + * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. 
There is + * in this em_perf_domain, another performance state with a higher frequency + * but a lower or equal power cost. Such inefficient states are ignored when + * using em_pd_get_efficient_*() functions. + */ +#define EM_PERF_STATE_INEFFICIENT BIT(0) + /** * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order -- cgit v1.2.3 From 88f7a89560f6d0fc7803a8933637488f14e0a098 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:24 +0100 Subject: PM: EM: Extend em_perf_domain with a flag field Merge the current "milliwatts" option into a "flag" field. This intends to prepare the extension of this structure for inefficient states support in the Energy Model. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3641ca4acf04..671440371a95 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -40,8 +40,7 @@ struct em_perf_state { * struct em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states - * @milliwatts: Flag indicating the power values are in milli-Watts - * or some other scale. + * @flags: See "em_perf_domain flags" * @cpus: Cpumask covering the CPUs of the domain. It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -56,10 +55,18 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; - int milliwatts; + unsigned long flags; unsigned long cpus[]; }; +/* + * em_perf_domain flags: + * + * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some + * other scale. + */ +#define EM_PERF_DOMAIN_MILLIWATTS BIT(0) + #define em_span_cpus(em) (to_cpumask((em)->cpus)) #ifdef CONFIG_ENERGY_MODEL -- cgit v1.2.3 From 8354eb9eb3ddb4a8d0857648a470beffcc9d8639 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:25 +0100 Subject: PM: EM: Allow skipping inefficient states The new performance domain flag EM_PERF_DOMAIN_SKIP_INEFFICIENCIES allows to not take into account inefficient states when estimating energy consumption. This intends to let the Energy Model know that CPUFreq itself will skip inefficiencies and such states don't need to be part of the estimation anymore. Signed-off-by: Vincent Donnefort Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 671440371a95..6377adc3b78d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -64,8 +64,12 @@ struct em_perf_domain { * * EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some * other scale. + * + * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating + * energy consumption. 
*/ #define EM_PERF_DOMAIN_MILLIWATTS BIT(0) +#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1) #define em_span_cpus(em) (to_cpumask((em)->cpus)) @@ -120,6 +124,37 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); +/** + * em_pd_get_efficient_state() - Get an efficient performance state from the EM + * @pd : Performance domain for which we want an efficient frequency + * @freq : Frequency to map with the EM + * + * It is called from the scheduler code quite frequently and as a consequence + * doesn't implement any check. + * + * Return: An efficient performance state, high enough to meet @freq + * requirement. + */ +static inline +struct em_perf_state *em_pd_get_efficient_state(struct em_perf_domain *pd, + unsigned long freq) +{ + struct em_perf_state *ps; + int i; + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &pd->table[i]; + if (ps->frequency >= freq) { + if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES && + ps->flags & EM_PERF_STATE_INEFFICIENT) + continue; + break; + } + } + + return ps; +} + /** * em_cpu_energy() - Estimates the energy consumed by the CPUs of a * performance domain @@ -142,7 +177,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, { unsigned long freq, scale_cpu; struct em_perf_state *ps; - int i, cpu; + int cpu; if (!sum_util) return 0; @@ -167,11 +202,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested frequency. */ - for (i = 0; i < pd->nr_perf_states; i++) { - ps = &pd->table[i]; - if (ps->frequency >= freq) - break; - } + ps = em_pd_get_efficient_state(pd, freq); /* * The capacity of a CPU in the domain at the performance state (ps) -- cgit v1.2.3 From 442d24a5c49a8ed1008dcb9c06dc463d12421e7f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:27 +0100 Subject: cpufreq: Add an interface to mark inefficient frequencies Some SoCs, such as the sd855, have OPPs within the same policy whose cost is higher than others with a higher frequency. Those OPPs are inefficient, and it might be interesting for a governor to not use them. cpufreq_table_set_inefficient() allows the caller to identify a specified frequency as being inefficient. Inefficient frequencies are only supported on sorted tables. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- include/linux/cpufreq.h | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff88bb3e44fc..16997e1ff3fe 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -660,10 +660,11 @@ struct governor_attr { *********************************************************************/ /* Special Values of .frequency field */ -#define CPUFREQ_ENTRY_INVALID ~0u -#define CPUFREQ_TABLE_END ~1u +#define CPUFREQ_ENTRY_INVALID ~0u +#define CPUFREQ_TABLE_END ~1u /* Special Values of .flags field */ -#define CPUFREQ_BOOST_FREQ (1 << 0) +#define CPUFREQ_BOOST_FREQ (1 << 0) +#define CPUFREQ_INEFFICIENT_FREQ (1 << 1) struct cpufreq_frequency_table { unsigned int flags; @@ -1003,6 +1004,36 @@ static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy return count; } +/** + * cpufreq_table_set_inefficient() - Mark a frequency as inefficient + * @policy: the &struct cpufreq_policy containing the inefficient frequency + * @frequency: the inefficient frequency + * + * The &struct cpufreq_policy must use a sorted frequency table + * + * Return: %0 on success or a negative errno code + */ + +static inline int +cpufreq_table_set_inefficient(struct cpufreq_policy *policy, + unsigned int frequency) +{ + struct cpufreq_frequency_table *pos; + + /* Not supported */ + if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) + return -EINVAL; + + cpufreq_for_each_valid_entry(pos, policy->freq_table) { + if (pos->frequency == frequency) { + pos->flags |= CPUFREQ_INEFFICIENT_FREQ; + return 0; + } + } + + return -EINVAL; +} + static inline int parse_perf_domain(int cpu, const char *list_name, const char *cell_name) { @@ -1071,6 +1102,13 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } +static inline int +cpufreq_table_set_inefficient(struct cpufreq_policy *policy, + unsigned int frequency) +{ + return -EINVAL; +} + static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_name, const char *cell_name, struct cpumask *cpumask) { -- cgit v1.2.3 From 1f39fa0dccff71d4788089b5e617229b19166867 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:28 +0100 Subject: cpufreq: Introducing CPUFREQ_RELATION_E This newly introduced flag can be applied by a governor to a CPUFreq relation, when looking for a frequency within the policy table. The resolution would then only walk through efficient frequencies. Even with the flag set, the policy max limit will still be honoured. If no efficient frequencies can be found within the limits of the policy, an inefficient one would be returned. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 111 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 16997e1ff3fe..6d96bd452d55 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -118,6 +118,13 @@ struct cpufreq_policy { */ bool strict_target; + /* + * Set if inefficient frequencies were found in the frequency table. + * This indicates if the relation flag CPUFREQ_RELATION_E can be + * honored. + */ + bool efficiencies_available; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. 
To be set by the @@ -273,6 +280,8 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy #define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */ #define CPUFREQ_RELATION_C 2 /* closest frequency to target */ +/* relation flags */ +#define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */ struct freq_attr { struct attribute attr; @@ -741,6 +750,22 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev, continue; \ else +/** + * cpufreq_for_each_efficient_entry_idx - iterate with index over a cpufreq + * frequency_table excluding CPUFREQ_ENTRY_INVALID and + * CPUFREQ_INEFFICIENT_FREQ frequencies. + * @pos: the &struct cpufreq_frequency_table to use as a loop cursor. + * @table: the &struct cpufreq_frequency_table to iterate over. + * @idx: the table entry currently being processed. + * @efficiencies: set to true to only iterate over efficient frequencies. + */ + +#define cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) \ + cpufreq_for_each_valid_entry_idx(pos, table, idx) \ + if (efficiencies && (pos->flags & CPUFREQ_INEFFICIENT_FREQ)) \ + continue; \ + else + int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); @@ -765,14 +790,15 @@ bool policy_has_boost_freq(struct cpufreq_policy *policy); /* Find lowest freq at or above target in a table in ascending order */ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq >= target_freq) @@ -786,14 +812,15 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, /* Find lowest freq at or above target in a table in descending order */ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -816,26 +843,30 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_l(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_al(policy, target_freq); + return cpufreq_table_find_index_al(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dl(policy, target_freq); + return cpufreq_table_find_index_dl(policy, target_freq, + efficiencies); } /* Find highest freq at or below target in a table in ascending order */ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct 
cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -858,14 +889,15 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy, /* Find highest freq at or below target in a table in descending order */ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq <= target_freq) @@ -879,26 +911,30 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_h(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_ah(policy, target_freq); + return cpufreq_table_find_index_ah(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dh(policy, target_freq); + return cpufreq_table_find_index_dh(policy, target_freq, + efficiencies); } /* Find closest freq to target in a table in ascending order */ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -925,14 +961,15 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy, /* Find closest freq to target in a table in descending order */ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { struct cpufreq_frequency_table *table = policy->freq_table; struct cpufreq_frequency_table *pos; unsigned int freq; int idx, best = -1; - cpufreq_for_each_valid_entry_idx(pos, table, idx) { + cpufreq_for_each_efficient_entry_idx(pos, table, idx, efficiencies) { freq = pos->frequency; if (freq == target_freq) @@ -959,35 +996,58 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy, /* Works only on sorted freq-tables */ static inline int cpufreq_table_find_index_c(struct cpufreq_policy *policy, - unsigned int target_freq) + unsigned int target_freq, + bool efficiencies) { target_freq = clamp_val(target_freq, policy->min, policy->max); if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) - return cpufreq_table_find_index_ac(policy, target_freq); + return cpufreq_table_find_index_ac(policy, target_freq, + efficiencies); else - return cpufreq_table_find_index_dc(policy, target_freq); + return cpufreq_table_find_index_dc(policy, target_freq, + efficiencies); } static inline int cpufreq_frequency_table_target(struct 
cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { + bool efficiencies = policy->efficiencies_available && + (relation & CPUFREQ_RELATION_E); + int idx; + + /* cpufreq_table_index_unsorted() has no use for this flag anyway */ + relation &= ~CPUFREQ_RELATION_E; + if (unlikely(policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED)) return cpufreq_table_index_unsorted(policy, target_freq, relation); - +retry: switch (relation) { case CPUFREQ_RELATION_L: - return cpufreq_table_find_index_l(policy, target_freq); + idx = cpufreq_table_find_index_l(policy, target_freq, + efficiencies); + break; case CPUFREQ_RELATION_H: - return cpufreq_table_find_index_h(policy, target_freq); + idx = cpufreq_table_find_index_h(policy, target_freq, + efficiencies); + break; case CPUFREQ_RELATION_C: - return cpufreq_table_find_index_c(policy, target_freq); + idx = cpufreq_table_find_index_c(policy, target_freq, + efficiencies); + break; default: WARN_ON_ONCE(1); return 0; } + + if (idx < 0 && efficiencies) { + efficiencies = false; + goto retry; + } + + return idx; } static inline int cpufreq_table_count_valid_entries(const struct cpufreq_policy *policy) @@ -1027,6 +1087,7 @@ cpufreq_table_set_inefficient(struct cpufreq_policy *policy, cpufreq_for_each_valid_entry(pos, policy->freq_table) { if (pos->frequency == frequency) { pos->flags |= CPUFREQ_INEFFICIENT_FREQ; + policy->efficiencies_available = true; return 0; } } -- cgit v1.2.3 From b894d20e6867f04827c7817fbc155460ff108f6f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 8 Sep 2021 15:05:29 +0100 Subject: cpufreq: Use CPUFREQ_RELATION_E in DVFS governors Let the schedutil, conservative and ondemand governors work, if possible, on efficient frequencies only. Signed-off-by: Vincent Donnefort Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6d96bd452d55..7ce71c7371fb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -283,6 +283,10 @@ static inline void cpufreq_stats_record_transition(struct cpufreq_policy *policy /* relation flags */ #define CPUFREQ_RELATION_E BIT(2) /* Get if possible an efficient frequency */ +#define CPUFREQ_RELATION_LE (CPUFREQ_RELATION_L | CPUFREQ_RELATION_E) +#define CPUFREQ_RELATION_HE (CPUFREQ_RELATION_H | CPUFREQ_RELATION_E) +#define CPUFREQ_RELATION_CE (CPUFREQ_RELATION_C | CPUFREQ_RELATION_E) + struct freq_attr { struct attribute attr; ssize_t (*show)(struct cpufreq_policy *, char *); @@ -636,9 +640,11 @@ struct cpufreq_governor *cpufreq_fallback_governor(void); static inline void cpufreq_policy_apply_limits(struct cpufreq_policy *policy) { if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); + __cpufreq_driver_target(policy, policy->max, + CPUFREQ_RELATION_HE); else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); + __cpufreq_driver_target(policy, policy->min, + CPUFREQ_RELATION_LE); } /* Governor attribute set */ -- cgit v1.2.3 From 4ad81f6ef89b62434f1d2ed26e9bec9d0e3d9dfe Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 4 Oct 2021 22:06:41 +0200 Subject: clk: tegra: Add stubs needed for compile testing These stubs are needed to allow the tegra-cpuidle driver to be compile-tested.
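To illustrate what the stubs buy us, a minimal sketch (the tegra_* helpers and CONFIG_ARCH_TEGRA come from the patch below; the calling function is hypothetical): a consumer can call the helpers unconditionally and rely on the empty inline stubs when building for other architectures.

    #include <linux/clk/tegra.h>

    /* Hypothetical consumer: compiles on any architecture thanks to the stubs. */
    static void example_cpu_power_down(u32 cpu)
    {
            tegra_disable_cpu_clock(cpu);   /* real operation on Tegra, no-op elsewhere */
            tegra_put_cpu_in_reset(cpu);
    }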
Signed-off-by: Thierry Reding --- include/linux/clk/tegra.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index d128ad1570aa..3650e926e93f 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -42,6 +42,7 @@ struct tegra_cpu_car_ops { #endif }; +#ifdef CONFIG_ARCH_TEGRA extern struct tegra_cpu_car_ops *tegra_cpu_car_ops; static inline void tegra_wait_cpu_in_reset(u32 cpu) @@ -83,8 +84,29 @@ static inline void tegra_disable_cpu_clock(u32 cpu) tegra_cpu_car_ops->disable_clock(cpu); } +#else +static inline void tegra_wait_cpu_in_reset(u32 cpu) +{ +} -#ifdef CONFIG_PM_SLEEP +static inline void tegra_put_cpu_in_reset(u32 cpu) +{ +} + +static inline void tegra_cpu_out_of_reset(u32 cpu) +{ +} + +static inline void tegra_enable_cpu_clock(u32 cpu) +{ +} + +static inline void tegra_disable_cpu_clock(u32 cpu) +{ +} +#endif + +#if defined(CONFIG_ARCH_TEGRA) && defined(CONFIG_PM_SLEEP) static inline bool tegra_cpu_rail_off_ready(void) { if (WARN_ON(!tegra_cpu_car_ops->rail_off_ready)) -- cgit v1.2.3 From 2357672c54c3f748f675446f8eba8b0432b1e7e2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 2 Oct 2021 06:47:49 +0530 Subject: bpf: Introduce BPF support for kernel module function calls This change adds support on the kernel side to allow for BPF programs to call kernel module functions. Userspace will prepare an array of module BTF fds that is passed in during BPF_PROG_LOAD using the fd_array parameter. In the kernel, the module BTFs are placed in the auxiliary struct for bpf_prog, and loaded as needed. The verifier then uses insn->off to index into the fd_array. insn->off 0 is reserved for vmlinux BTF (for backwards compat), so userspace must use an fd_array index > 0 for module kfunc support. kfunc_btf_tab is sorted based on offset in an array, and each offset corresponds to one descriptor, with a limit of up to 256 such module BTFs. We also change the existing kfunc_tab to distinguish each element based on the (imm, off) pair, as each such call will now be distinct. Another change is to the check_kfunc_call callback, which now includes a struct module * pointer; this is used in a later patch so that the kfunc_id and module pointer are matched for dynamically registered BTF sets from loadable modules, so that the same kfunc_id in two modules doesn't lead to check_kfunc_call succeeding. For the duration of the check_kfunc_call, the reference to struct module exists, as it returns the pointer stored in kfunc_btf_tab. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211002011757.311265-2-memxor@gmail.com --- include/linux/bpf.h | 8 +++++--- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c7fd7c4c6d3..d604c8251d88 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -513,7 +513,7 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); - bool (*check_kfunc_call)(u32 kfunc_btf_id); + bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner); }; struct bpf_prog_offload_ops { @@ -877,6 +877,7 @@ struct bpf_prog_aux { void *jit_data; /* JIT specific data.
arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; + struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; @@ -1639,7 +1640,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id); +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1860,7 +1861,8 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } -static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, + struct module *owner) { return false; } diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5424124dbe36..c8a78e830fca 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -527,5 +527,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); +void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 14f267d95fe4b08831a022c8e15a2eb8991edbf6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 2 Oct 2021 06:47:51 +0530 Subject: bpf: btf: Introduce helpers for dynamic BTF set registration This adds helpers for registering a btf_id_set from modules and the bpf_check_mod_kfunc_call callback that can be used to look them up. With in-kernel sets, the way this is supposed to work is that the in-kernel callback first looks up the BTF id in the in-kernel kfunc whitelist, and then defers to the dynamic BTF set lookup if it doesn't find the BTF id. If there is no in-kernel BTF id set, this callback can be used directly. Also fix includes for btf.h and bpfptr.h so that they can be included in isolation. This is in preparation for their usage in tcp_bbr, tcp_cubic and tcp_dctcp modules in the next patch.
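To make the intended flow concrete, a minimal module-side sketch (the example_* names and the choice of list are hypothetical; DEFINE_KFUNC_BTF_ID_SET and the register/unregister helpers are the ones added below, and the BTF_SET_*/BTF_ID macros come from btf_ids.h):

    BTF_SET_START(example_kfunc_ids)
    BTF_ID(func, example_module_kfunc)
    BTF_SET_END(example_kfunc_ids)

    /* Binds the set to THIS_MODULE as owner. */
    DEFINE_KFUNC_BTF_ID_SET(&example_kfunc_ids, example_kfunc_btf_set);

    static int __init example_init(void)
    {
            /* some_kfunc_list stands in for a subsystem-provided kfunc_btf_id_list,
             * e.g. the bpf_tcp_ca list added later in this series.
             */
            register_kfunc_btf_id_set(&some_kfunc_list, &example_kfunc_btf_set);
            return 0;
    }

    static void __exit example_exit(void)
    {
            unregister_kfunc_btf_id_set(&some_kfunc_list, &example_kfunc_btf_set);
    }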
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211002011757.311265-4-memxor@gmail.com --- include/linux/bpfptr.h | 1 + include/linux/btf.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 546e27fc6d46..46e1757d06a3 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -3,6 +3,7 @@ #ifndef _LINUX_BPFPTR_H #define _LINUX_BPFPTR_H +#include #include typedef sockptr_t bpfptr_t; diff --git a/include/linux/btf.h b/include/linux/btf.h index 214fde93214b..6c4c61d821d7 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -5,6 +5,7 @@ #define _LINUX_BTF_H 1 #include +#include #include #include @@ -238,4 +239,39 @@ static inline const char *btf_name_by_offset(const struct btf *btf, } #endif +struct kfunc_btf_id_set { + struct list_head list; + struct btf_id_set *set; + struct module *owner; +}; + +struct kfunc_btf_id_list; + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, + struct kfunc_btf_id_set *s); +void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, + struct kfunc_btf_id_set *s); +bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, + struct module *owner); +#else +static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, + struct kfunc_btf_id_set *s) +{ +} +static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, + struct kfunc_btf_id_set *s) +{ +} +static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, + u32 kfunc_id, struct module *owner) +{ + return false; +} +#endif + +#define DEFINE_KFUNC_BTF_ID_SET(set, name) \ + struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ + THIS_MODULE } + #endif -- cgit v1.2.3 From 0e32dfc80bae53b05e9eda7eaf259f30ab9ba43a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 2 Oct 2021 06:47:53 +0530 Subject: bpf: Enable TCP congestion control kfunc from modules This commit moves BTF ID lookup into the newly added registration helper, in a way that the bbr, cubic, and dctcp implementations set up their sets in the bpf_tcp_ca kfunc_btf_set list, while the ones not dependent on modules are looked up from the wrapper function. This lifts the restriction that they be compiled as built-in objects; they can now be loaded as modules if required. Also modify Makefile.modfinal to call resolve_btfids for each module. Note that since kernel kfunc_ids never overlap with module kfunc_ids, we only match the owner for module btf id sets.
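The lookup side described above can be sketched as follows (the in-kernel set name is illustrative; btf_id_set_contains() is the existing btf_ids.h helper, while bpf_check_mod_kfunc_call() and the bpf_tcp_ca kfunc list are the ones added in this series):

    /* In-kernel whitelist first, then dynamically registered module sets. */
    static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner)
    {
            if (btf_id_set_contains(&example_builtin_kfunc_ids, kfunc_btf_id))
                    return true;
            return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner);
    }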
See following commits for background on use of: CONFIG_X86 ifdef: 569c484f9995 (bpf: Limit static tcp-cc functions in the .BTF_ids list to x86) CONFIG_DYNAMIC_FTRACE ifdef: 7aae231ac93b (bpf: tcp: Limit calling some tcp cc functions to CONFIG_DYNAMIC_FTRACE) Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211002011757.311265-6-memxor@gmail.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 6c4c61d821d7..1d56cd2bb362 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -274,4 +274,6 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; + #endif -- cgit v1.2.3 From c48e51c8b07aba8a18125221cb67a40cb1256bf2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 2 Oct 2021 06:47:57 +0530 Subject: bpf: selftests: Add selftests for module kfunc support This adds selftests that test the success and failure paths for module kfuncs (in presence of invalid kfunc calls) for both libbpf and gen_loader. It also adds a prog_test kfunc_btf_id_list so that we can add a module BTF ID set from bpf_testmod. This also introduces a couple of test cases to verifier selftests for validating whether we get an error or not depending on whether an invalid kfunc call remains after elimination of unreachable instructions. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211002011757.311265-10-memxor@gmail.com --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 1d56cd2bb362..203eef993d76 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -275,5 +275,6 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, THIS_MODULE } extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #endif -- cgit v1.2.3 From 4f627af8e6068892cafe031df6c14e8a0aaaa426 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Sep 2021 16:10:11 -0500 Subject: ptrace: Remove the unnecessary arguments from arch_ptrace_stop Both arch_ptrace_stop_needed and arch_ptrace_stop are called with an exit_code and a siginfo structure. Neither argument is used by any of the implementations so just remove the unneeded arguments. The two architectures that implement arch_ptrace_stop are ia64 and sparc. Both architectures flush their register stacks before a ptrace_stop so that all of the register information can be accessed by debuggers. As the question of whether a register stack needs to be flushed is independent of why ptrace is stopping, not needing arguments makes sense. Cc: David Miller Cc: sparclinux@vger.kernel.org Link: https://lkml.kernel.org/r/87lf3mx290.fsf@disp2133 Reviewed-by: Kees Cook Signed-off-by: "Eric W.
Biederman" --- include/linux/ptrace.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index b5ebf6c01292..8aee2945ff08 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -362,29 +362,25 @@ static inline void user_single_step_report(struct pt_regs *regs) #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called - * @code: current->exit_code value ptrace will stop with - * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with the siglock held, to decide whether or not it's - * necessary to release the siglock and call arch_ptrace_stop() with the - * same @code and @info arguments. It can be defined to a constant if - * arch_ptrace_stop() is never required, or always is. On machines where - * this makes sense, it should be defined to a quick test to optimize out - * calling arch_ptrace_stop() when it would be superfluous. For example, - * if the thread has not been back to user mode since the last stop, the - * thread state might indicate that nothing needs to be done. + * necessary to release the siglock and call arch_ptrace_stop(). It can be + * defined to a constant if arch_ptrace_stop() is never required, or always + * is. On machines where this makes sense, it should be defined to a quick + * test to optimize out calling arch_ptrace_stop() when it would be + * superfluous. For example, if the thread has not been back to user mode + * since the last stop, the thread state might indicate that nothing needs + * to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ -#define arch_ptrace_stop_needed(code, info) (0) +#define arch_ptrace_stop_needed() (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace - * @code: current->exit_code value ptrace will stop with - * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory @@ -394,7 +390,7 @@ static inline void user_single_step_report(struct pt_regs *regs) * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ -#define arch_ptrace_stop(code, info) do { } while (0) +#define arch_ptrace_stop() do { } while (0) #endif #ifndef current_pt_regs -- cgit v1.2.3 From 92307383082daff5df884a25df9e283efb7ef261 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Sep 2021 11:33:50 -0500 Subject: coredump: Don't perform any cleanups before dumping core Rename coredump_exit_mm to coredump_task_exit and call it from do_exit before PTRACE_EVENT_EXIT, and before any cleanup work for a task happens. This ensures that an accurate copy of the process can be captured in the coredump as no cleanup for the process happens before the coredump completes. This also ensures that PTRACE_EVENT_EXIT will not be visited by any thread until the coredump is complete. Add a new flag PF_POSTCOREDUMP so that tasks that have passed through coredump_task_exit can be recognized and ignored in zap_process. Now that all of the coredumping happens before exit_mm remove code to test for a coredump in progress from mm_release. 
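The resulting exit ordering can be sketched as follows (a heavily simplified outline based on the description above, not the literal patch):

    void do_exit(long code)
    {
            /* ... */
            coredump_task_exit();   /* complete any coredump; the task is then marked PF_POSTCOREDUMP */
            ptrace_event(PTRACE_EVENT_EXIT, code);  /* only reached once the dump is done */
            exit_mm();              /* cleanup happens strictly after the coredump */
            /* ... */
    }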
Replace "may_ptrace_stop()" with a simple test of "current->ptrace". The other tests in may_ptrace_stop all concern avoiding stopping during a coredump. These tests are no longer necessary as it is now guaranteed that fatal_signal_pending will be set if the code enters ptrace_stop during a coredump. The code in ptrace_stop is guaranteed not to stop if fatal_signal_pending returns true. Until this change "ptrace_event(PTRACE_EVENT_EXIT)" could call ptrace_stop without fatal_signal_pending being true, as signals are dequeued in get_signal before calling do_exit. This is no longer an issue as "ptrace_event(PTRACE_EVENT_EXIT)" is no longer reached until after the coredump completes. Link: https://lkml.kernel.org/r/874kaax26c.fsf@disp2133 Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e12b524426b0..f3741f23935e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1664,6 +1664,7 @@ extern struct pid *cad_pid; #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ +#define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ -- cgit v1.2.3 From 353407d917b2d87cd8104a0453d012439c6ca4be Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 6 Oct 2021 13:46:42 +0300 Subject: ethtool: Add ability to control transceiver modules' power mode Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and 'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver modules' parameters and retrieve their status. The first parameter to control is the power mode of the module. It is only relevant for paged memory modules, as flat memory modules always operate in low power mode. When a paged memory module is in low power mode, its power consumption is reduced to the minimum, the management interface towards the host is available and the data path is deactivated. User space can choose to put modules that are not currently in use in low power mode and transition them to high power mode before putting the associated ports administratively up. This is useful for user space that favors reduced power consumption and lower temperatures over reduced link up times. In QSFP-DD modules the transition from low power mode to high power mode can take a few seconds and this transition is only expected to get longer with future / more complex modules. User space can control the power mode of the module via the power mode policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible values: * high: Module is always in high power mode. * auto: Module is transitioned by the host to high power mode when the first port using it is put administratively up and to low power mode when the last port using it is put administratively down. The operational power mode of the module is available to user space via the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not reported to user space when a module is not plugged-in. The user API is designed to be generic enough so that it could be used for modules with different memory maps (e.g., SFF-8636, CMIS).
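On the device driver side, the two new callbacks could be wired up roughly as follows (all foo_* names are hypothetical; the ops members and the parameter struct are the ones added in the diff below):

    static int foo_get_module_power_mode(struct net_device *dev,
                                         struct ethtool_module_power_mode_params *params,
                                         struct netlink_ext_ack *extack)
    {
            struct foo_priv *priv = netdev_priv(dev);

            params->policy = priv->module_power_policy;
            params->mode = foo_query_module_power_mode(priv);   /* e.g. from firmware */
            return 0;
    }

    static const struct ethtool_ops foo_ethtool_ops = {
            .get_module_power_mode = foo_get_module_power_mode,
            .set_module_power_mode = foo_set_module_power_mode, /* analogous setter */
    };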
The only implementation of the device driver API in this series is for a MAC driver (mlxsw) where the module is controlled by the device's firmware, but it is designed to be generic enough so that it could also be used by implementations where the module is controlled by the CPU. CMIS testing ============ # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x03 (ModuleReady) LowPwrAllowRequestHW : Off LowPwrRequestSW : Off The module is not in low power mode, as it is not forced by hardware (LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off). The power mode can be queried from the kernel. In case LowPwrAllowRequestHW was on, the kernel would need to take into account the state of the LowPwrRequestHW signal, which is not visible to user space. $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy high power-mode high Change the power mode policy to 'auto': # ethtool --set-module swp11 power-mode-policy auto Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x01 (ModuleLowPwr) LowPwrAllowRequestHW : Off LowPwrRequestSW : On Put the associated port administratively up which will instruct the host to transition the module to high power mode: # ip link set dev swp11 up Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode high Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x03 (ModuleReady) LowPwrAllowRequestHW : Off LowPwrRequestSW : Off Put the associated port administratively down which will instruct the host to transition the module to low power mode: # ip link set dev swp11 down Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x01 (ModuleLowPwr) LowPwrAllowRequestHW : Off LowPwrRequestSW : On SFF-8636 testing ================ # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled Power set : Off Power override : On ... Transmit avg optical power (Channel 1) : 0.7733 mW / -1.12 dBm Transmit avg optical power (Channel 2) : 0.7649 mW / -1.16 dBm Transmit avg optical power (Channel 3) : 0.7790 mW / -1.08 dBm Transmit avg optical power (Channel 4) : 0.7837 mW / -1.06 dBm Rcvr signal avg optical power(Channel 1) : 0.9302 mW / -0.31 dBm Rcvr signal avg optical power(Channel 2) : 0.9079 mW / -0.42 dBm Rcvr signal avg optical power(Channel 3) : 0.8993 mW / -0.46 dBm Rcvr signal avg optical power(Channel 4) : 0.8778 mW / -0.57 dBm The module is not in low power mode, as it is not forced by hardware (Power override is on) or by software (Power set is off). The power mode can be queried from the kernel. In case Power override was off, the kernel would need to take into account the state of the LPMode signal, which is not visible to user space. 
$ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy high power-mode high Change the power mode policy to 'auto': # ethtool --set-module swp13 power-mode-policy auto Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled Power set : On Power override : On ... Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm Put the associated port administratively up which will instruct the host to transition the module to high power mode: # ip link set dev swp13 up Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode high Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled Power set : Off Power override : On ... Transmit avg optical power (Channel 1) : 0.7934 mW / -1.01 dBm Transmit avg optical power (Channel 2) : 0.7859 mW / -1.05 dBm Transmit avg optical power (Channel 3) : 0.7885 mW / -1.03 dBm Transmit avg optical power (Channel 4) : 0.7985 mW / -0.98 dBm Rcvr signal avg optical power(Channel 1) : 0.9325 mW / -0.30 dBm Rcvr signal avg optical power(Channel 2) : 0.9034 mW / -0.44 dBm Rcvr signal avg optical power(Channel 3) : 0.9086 mW / -0.42 dBm Rcvr signal avg optical power(Channel 4) : 0.8885 mW / -0.51 dBm Put the associated port administratively down which will instruct the host to transition the module to low power mode: # ip link set dev swp13 down Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled Power set : On Power override : On ... 
Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 849524b55d89..9adf8d2c3144 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -415,6 +415,17 @@ struct ethtool_module_eeprom { u8 *data; }; +/** + * struct ethtool_module_power_mode_params - module power mode parameters + * @policy: The power mode policy enforced by the host for the plug-in module. + * @mode: The operational power mode of the plug-in module. Should be filled by + * device drivers on get operations. + */ +struct ethtool_module_power_mode_params { + enum ethtool_module_power_mode_policy policy; + enum ethtool_module_power_mode mode; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -580,6 +591,11 @@ struct ethtool_module_eeprom { * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics. * Set %ranges to a pointer to zero-terminated array of byte ranges. + * @get_module_power_mode: Get the power mode policy for the plug-in module + * used by the network device and its operational power mode, if + * plugged-in. + * @set_module_power_mode: Set the power mode policy for the plug-in module + * used by the network device. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -705,6 +721,12 @@ struct ethtool_ops { void (*get_rmon_stats)(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); + int (*get_module_power_mode)(struct net_device *dev, + struct ethtool_module_power_mode_params *params, + struct netlink_ext_ack *extack); + int (*set_module_power_mode)(struct net_device *dev, + const struct ethtool_module_power_mode_params *params, + struct netlink_ext_ack *extack); }; int ethtool_check_ops(const struct ethtool_ops *ops); -- cgit v1.2.3 From 3dfb51126064b594470b9c0b278188fbc9194709 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 6 Oct 2021 13:46:46 +0300 Subject: ethtool: Add transceiver module extended state Add an extended state and sub-state to describe link issues related to transceiver modules. The 'ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY' extended sub-state tells user space that port is unable to gain a carrier because the CMIS Module State Machine did not reach the ModuleReady (Fully Operational) state. For example, if the module is stuck at ModuleLowPwr or ModuleFault state. In case of the latter, user space can read the fault reason from the module's EEPROM and potentially reset it. 
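A driver could report the new sub-state from its get_link_ext_state() callback roughly as follows (foo_* names are hypothetical; ETHTOOL_LINK_EXT_STATE_MODULE is the matching extended state introduced alongside this sub-state):

    static int foo_get_link_ext_state(struct net_device *dev,
                                      struct ethtool_link_ext_state_info *info)
    {
            if (foo_module_sm_not_ready(dev)) {
                    info->link_ext_state = ETHTOOL_LINK_EXT_STATE_MODULE;
                    info->module = ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY;
                    return 0;
            }
            return -ENODATA;    /* no extended state to report */
    }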
Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9adf8d2c3144..845a0ffc16ee 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -94,6 +94,7 @@ struct ethtool_link_ext_state_info { enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; + enum ethtool_link_ext_substate_module module; u8 __link_ext_substate; }; }; -- cgit v1.2.3 From 79365f36d1de87286bb4fc0abcb2a01678ef4bef Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 6 Oct 2021 13:19:20 +0100 Subject: net: mdio: add mdiobus_modify_changed() Add mdiobus_modify_changed() helper to reflect the phylib and similar equivalents. This will avoid this functionality being open-coded, as has already happened in phylink, and it looks like other users will be appearing soon. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 5e6dc38f418e..f622888a4ba8 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -349,6 +349,8 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set); +int mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, + u16 mask, u16 set); static inline u32 mdiobus_c45_addr(int devad, u16 regnum) { -- cgit v1.2.3 From af8cc9600bbf2251b04c56139f7c83f87c3f878a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Sep 2021 14:10:51 -0300 Subject: futex: Split out syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Put the syscalls in their own little file. Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Thomas Gleixner Signed-off-by: AndrĂ© Almeida Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: AndrĂ© Almeida Link: https://lore.kernel.org/r/20210923171111.300673-3-andrealmeid@collabora.com --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 252243c7783d..25979682ade5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -610,7 +610,7 @@ asmlinkage long sys_waitid(int which, pid_t pid, asmlinkage long sys_set_tid_address(int __user *tidptr); asmlinkage long sys_unshare(unsigned long unshare_flags); -/* kernel/futex.c */ +/* kernel/futex/syscalls.c */ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, const struct __kernel_timespec __user *utime, u32 __user *uaddr2, u32 val3); -- cgit v1.2.3 From bf69bad38cf63d980e8a603f8d1bd1f85b5ed3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Thu, 23 Sep 2021 14:11:05 -0300 Subject: futex: Implement sys_futex_waitv() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to wait on multiple futexes. 
This is the interface implemented by this syscall: futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, unsigned int flags, struct timespec *timeout, clockid_t clockid) struct futex_waitv { __u64 val; __u64 uaddr; __u32 flags; __u32 __reserved; }; Given an array of struct futex_waitv, wait on each uaddr. The thread wakes if a futex_wake() is performed at any uaddr. The syscall returns immediately if any waiter has *uaddr != val. *timeout is an optional absolute timeout value for the operation. This syscall supports only 64bit sized timeout structs. The flags argument of the syscall should be empty, but it can be used for future extensions. Flags for shared futexes, sizes, etc. should be used on the individual flags of each waiter. __reserved is used for explicit padding and should be 0, but it might be used for future extensions. If the userspace uses 32-bit pointers, it should make sure to explicitly cast it when assigning to waitv::uaddr. Returns the array index of one of the woken futexes. There’s no given information of how many were woken, or any particular attribute of it (if it’s the first woken, if it is of the smaller index...). Signed-off-by: AndrĂ© Almeida Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210923171111.300673-17-andrealmeid@collabora.com --- include/linux/syscalls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 25979682ade5..528a478dbda8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -58,6 +58,7 @@ struct mq_attr; struct compat_stat; struct old_timeval32; struct robust_list_head; +struct futex_waitv; struct getcpu_cache; struct old_linux_dirent; struct perf_event_attr; @@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, + unsigned int nr_futexes, unsigned int flags, + struct __kernel_timespec __user *timeout, clockid_t clockid); + /* kernel/hrtimer.c */ asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, struct __kernel_timespec __user *rmtp); -- cgit v1.2.3 From 9b3c4ab3045e953670c7de9c1165fae5358a7237 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 21:54:32 +0200 Subject: sched,rcu: Rework try_invoke_on_locked_down_task() Give try_invoke_on_locked_down_task() a saner name and have it return an int so that the caller might distinguish between different reasons of failure. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.649944917@infradead.org --- include/linux/wait.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 93dab0e9580f..2d0df57c9902 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1160,6 +1160,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); +typedef int (*task_call_f)(struct task_struct *p, void *arg); +extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */ -- cgit v1.2.3 From e330fb14590c5c80f7195c3d8c9b4bcf79e1a5cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:54 -0700 Subject: of: net: move of_net under net/ Rob suggests moving of_net.c from under drivers/of/ to somewhere in the networking code. Suggested-by: Rob Herring Signed-off-by: Jakub Kicinski Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- include/linux/of_net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index daef3b0d9270..cf31188329b5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_OF_NET +#ifdef CONFIG_OF #include struct net_device; -- cgit v1.2.3 From d466effe282ddbab6acb6c3120c1de0ee1b86d57 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:55 -0700 Subject: of: net: add a helper for loading netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced an rbtree for faster Ethernet address look up. To maintain netdev->dev_addr in this tree we need to make all the writes to it go through appropriate helpers. There are roughly 40 places where netdev->dev_addr is passed as the destination to an of_get_mac_address() call. Add a helper which takes a dev pointer instead, so it can call an appropriate helper. Note that of_get_mac_address() already assumes the address is 6 bytes long (ETH_ALEN) so use eth_hw_addr_set(). Signed-off-by: Jakub Kicinski Signed-off-by: David S.
Miller --- include/linux/of_net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index cf31188329b5..0797e2edb8c2 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else static inline int of_get_phy_mode(struct device_node *np, @@ -27,6 +28,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) +{ + return -ENODEV; +} + static inline struct net_device *of_find_net_device_by_node(struct device_node *np) { return NULL; -- cgit v1.2.3 From 433baf0719d6a81d0587ea27545a120a3880abf6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:57 -0700 Subject: device property: move mac addr helpers to eth.c Move the mac address helpers out, eth.c already contains a bunch of similar helpers. Suggested-by: Heikki Krogerus Acked-by: Greg Kroah-Hartman Signed-off-by: Jakub Kicinski Reviewed-by: Heikki Krogerus Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 6 ++++++ include/linux/property.h | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e7b2e5fd8d24..39f0758274ae 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -26,9 +26,15 @@ #ifdef __KERNEL__ struct device; +struct fwnode_handle; + int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); +void *device_get_mac_address(struct device *dev, char *addr, int alen); +void *fwnode_get_mac_address(struct fwnode_handle *fwnode, + char *addr, int alen); + u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; diff --git a/include/linux/property.h b/include/linux/property.h index 357513a977e5..4fb081684255 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -389,11 +389,7 @@ const void *device_get_match_data(struct device *dev); int device_get_phy_mode(struct device *dev); -void *device_get_mac_address(struct device *dev, char *addr, int alen); - int fwnode_get_phy_mode(struct fwnode_handle *fwnode); -void *fwnode_get_mac_address(struct fwnode_handle *fwnode, - char *addr, int alen); struct fwnode_handle *fwnode_graph_get_next_endpoint( const struct fwnode_handle *fwnode, struct fwnode_handle *prev); struct fwnode_handle * -- cgit v1.2.3 From 8017c4d8173cfe086420dc5710d631cabd03ef67 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:58 -0700 Subject: eth: fwnode: change the return type of mac address helpers fwnode_get_mac_address() and device_get_mac_address() return a pointer to the buffer that was passed to them on success or NULL on failure. None of the callers care about the actual value, only if it's NULL or not. 
These semantics differ from of_get_mac_address() which returns an int so to avoid confusion make the device helpers return an errno. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 39f0758274ae..8299f1cd9175 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -31,9 +31,8 @@ struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); -void *device_get_mac_address(struct device *dev, char *addr, int alen); -void *fwnode_get_mac_address(struct fwnode_handle *fwnode, - char *addr, int alen); +int device_get_mac_address(struct device *dev, char *addr, int alen); +int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); -- cgit v1.2.3 From 0a14501ed818ff51eed237bbe5009d0d784e4450 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:59 -0700 Subject: eth: fwnode: remove the addr len from mac helpers All callers pass in ETH_ALEN and the function itself will return -EINVAL for any other address length. Just assume it's ETH_ALEN like all other mac address helpers (nvm, of, platform). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 8299f1cd9175..bb612c7382e3 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -31,8 +31,8 @@ struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); -int device_get_mac_address(struct device *dev, char *addr, int alen); -int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen); +int device_get_mac_address(struct device *dev, char *addr); +int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); -- cgit v1.2.3 From d9eb44904e87c8ad1da0240849dbab638bacb799 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:07:00 -0700 Subject: eth: fwnode: add a helper for loading netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced an rbtree for faster Ethernet address look up. To maintain netdev->dev_addr in this tree we need to make all the writes to it go through appropriate helpers. There is a handful of drivers which pass netdev->dev_addr as the destination buffer to device_get_mac_address(). Add a helper which takes a dev pointer instead, so it can call an appropriate helper. Signed-off-by: Jakub Kicinski Signed-off-by: David S.
Miller --- include/linux/etherdevice.h | 1 + include/linux/property.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index bb612c7382e3..a8bb64cf4079 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -32,6 +32,7 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); +int device_get_ethdev_address(struct device *dev, struct net_device *netdev); int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr); u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); diff --git a/include/linux/property.h b/include/linux/property.h index 4fb081684255..88fa726a76df 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -15,6 +15,7 @@ #include struct device; +struct net_device; enum dev_prop_type { DEV_PROP_U8, -- cgit v1.2.3 From bdc7ca008e1f5539e891187032cb2cbbc3decb5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 7 Oct 2021 14:14:13 +0200 Subject: spi: Remove unused function spi_busnum_to_master() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last user is gone since commit 2962db71c703 ("staging/fbtft: Remove fbtft_device") in 2019. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211007121415.2401638-3-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8371bca13729..f8e322a46616 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -760,8 +760,6 @@ extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); -extern struct spi_controller *spi_busnum_to_master(u16 busnum); - /* * SPI resource management while processing a SPI message */ -- cgit v1.2.3 From da21fde0fdb393c2fbe0ae0735cc826cd55fd46f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 7 Oct 2021 14:14:15 +0200 Subject: spi: Make several public functions private to spi.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All these functions have no callers apart from drivers/spi/spi.c. So drop their declarations in include/linux/spi/spi.h and don't export them. 
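With the two-stage spi_alloc_device()/spi_add_device() pair no longer public, board-style instantiation goes through spi_new_device() alone; a minimal sketch (the example values and names are assumptions for illustration):

    static struct spi_board_info example_board_info = {
            .modalias      = "example-dev",
            .max_speed_hz  = 1000000,
            .chip_select   = 0,
    };

    struct spi_device *example_register(struct spi_controller *ctlr)
    {
            /* Single-step allocate-and-register; returns NULL on failure. */
            return spi_new_device(ctlr, &example_board_info);
    }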
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20211007121415.2401638-5-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 53 ------------------------------------------------- 1 file changed, 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index f8e322a46616..29e21d49aafc 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -78,10 +78,6 @@ struct spi_statistics { unsigned long transfers_split_maxsize; }; -void spi_statistics_add_transfer_stats(struct spi_statistics *stats, - struct spi_transfer *xfer, - struct spi_controller *ctlr); - #define SPI_STATISTICS_ADD_TO_FIELD(stats, field, count) \ do { \ unsigned long flags; \ @@ -783,15 +779,6 @@ struct spi_res { unsigned long long data[]; /* guarantee ull alignment */ }; -extern void *spi_res_alloc(struct spi_device *spi, - spi_res_release_t release, - size_t size, gfp_t gfp); -extern void spi_res_add(struct spi_message *message, void *res); -extern void spi_res_free(void *res); - -extern void spi_res_release(struct spi_controller *ctlr, - struct spi_message *message); - /*---------------------------------------------------------------------------*/ /* @@ -1109,8 +1096,6 @@ static inline void spi_message_free(struct spi_message *m) extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); -extern int spi_async_locked(struct spi_device *spi, - struct spi_message *message); extern int spi_slave_abort(struct spi_device *spi); static inline size_t @@ -1193,15 +1178,6 @@ struct spi_replaced_transfers { struct spi_transfer inserted_transfers[]; }; -extern struct spi_replaced_transfers *spi_replace_transfers( - struct spi_message *msg, - struct spi_transfer *xfer_first, - size_t remove, - size_t insert, - spi_replaced_release_t release, - size_t extradatasize, - gfp_t gfp); - /*---------------------------------------------------------------------------*/ /* SPI transfer transformation methods */ @@ -1473,19 +1449,7 @@ spi_register_board_info(struct spi_board_info const *info, unsigned n) * use spi_new_device() to describe each device. You can also call * spi_unregister_device() to start making that device vanish, but * normally that would be handled by spi_unregister_controller(). - * - * You can also use spi_alloc_device() and spi_add_device() to use a two - * stage registration sequence for each spi_device. This gives the caller - * some more control over the spi_device structure before it is registered, - * but requires that caller to initialize fields that would otherwise - * be defined using the board info. 
*/ -extern struct spi_device * -spi_alloc_device(struct spi_controller *ctlr); - -extern int -spi_add_device(struct spi_device *spi); - extern struct spi_device * spi_new_device(struct spi_controller *, struct spi_board_info *); @@ -1500,23 +1464,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) return list_is_last(&xfer->transfer_list, &ctlr->cur_msg->transfers); } -/* OF support code */ -#if IS_ENABLED(CONFIG_OF) - -/* must call put_device() when done with returned spi_device device */ -extern struct spi_device * -of_find_spi_device_by_node(struct device_node *node); - -#else - -static inline struct spi_device * -of_find_spi_device_by_node(struct device_node *node) -{ - return NULL; -} - -#endif /* IS_ENABLED(CONFIG_OF) */ - /* Compatibility layer */ #define spi_master spi_controller -- cgit v1.2.3 From 424953cf3c6657f1e67e1a2c5d6e3bb518ea4e9a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Sep 2021 09:50:27 +0200 Subject: qcom_scm: hide Kconfig symbol Now that SCM can be a loadable module, we have to add another dependency to avoid link failures when ipa or adreno-gpu are built-in: aarch64-linux-ld: drivers/net/ipa/ipa_main.o: in function `ipa_probe': ipa_main.c:(.text+0xfc4): undefined reference to `qcom_scm_is_available' ld.lld: error: undefined symbol: qcom_scm_is_available >>> referenced by adreno_gpu.c >>> gpu/drm/msm/adreno/adreno_gpu.o:(adreno_zap_shader_load) in archive drivers/built-in.a This can happen when CONFIG_ARCH_QCOM is disabled and we don't select QCOM_MDT_LOADER, but some other module selects QCOM_SCM. Ideally we'd use a similar dependency here to what we have for QCOM_RPROC_COMMON, but that causes dependency loops from other things selecting QCOM_SCM. This appears to be an endless problem, so try something different this time: - CONFIG_QCOM_SCM becomes a hidden symbol that nothing 'depends on' but that is simply selected by all of its users - All the stubs in include/linux/qcom_scm.h can go away - arm-smccc.h needs to provide a stub for __arm_smccc_smc() to allow compile-testing QCOM_SCM on all architectures. - To avoid a circular dependency chain involving RESET_CONTROLLER and PINCTRL_SUNXI, drop the 'select RESET_CONTROLLER' statement. According to my testing this still builds fine, and the QCOM platform selects this symbol already. Acked-by: Kalle Valo Acked-by: Alex Elder Signed-off-by: Arnd Bergmann --- include/linux/arm-smccc.h | 10 +++++++ include/linux/qcom_scm.h | 71 ----------------------------------------------- 2 files changed, 10 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 7d1cabe15262..63ccb5252190 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -321,10 +321,20 @@ asmlinkage unsigned long __arm_smccc_sve_check(unsigned long x0); * from register 0 to 3 on return from the SMC instruction. An optional * quirk structure provides vendor specific behavior. 
*/ +#ifdef CONFIG_HAVE_ARM_SMCCC asmlinkage void __arm_smccc_smc(unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5, unsigned long a6, unsigned long a7, struct arm_smccc_res *res, struct arm_smccc_quirk *quirk); +#else +static inline void __arm_smccc_smc(unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3, unsigned long a4, + unsigned long a5, unsigned long a6, unsigned long a7, + struct arm_smccc_res *res, struct arm_smccc_quirk *quirk) +{ + *res = (struct arm_smccc_res){}; +} +#endif /** * __arm_smccc_hvc() - make HVC calls diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index c0475d1c9885..81cad9e1e412 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -61,7 +61,6 @@ enum qcom_scm_ice_cipher { #define QCOM_SCM_PERM_RW (QCOM_SCM_PERM_READ | QCOM_SCM_PERM_WRITE) #define QCOM_SCM_PERM_RWX (QCOM_SCM_PERM_RW | QCOM_SCM_PERM_EXEC) -#if IS_ENABLED(CONFIG_QCOM_SCM) extern bool qcom_scm_is_available(void); extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus); @@ -115,74 +114,4 @@ extern int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, extern int qcom_scm_lmh_profile_change(u32 profile_id); extern bool qcom_scm_lmh_dcvsh_available(void); -#else - -#include <linux/errno.h> - -static inline bool qcom_scm_is_available(void) { return false; } - -static inline int qcom_scm_set_cold_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline int qcom_scm_set_warm_boot_addr(void *entry, - const cpumask_t *cpus) { return -ENODEV; } -static inline void qcom_scm_cpu_power_down(u32 flags) {} -static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; } -static inline int qcom_scm_pas_init_image(u32 peripheral, const void *metadata, - size_t size) { return -ENODEV; } -static inline int qcom_scm_pas_mem_setup(u32 peripheral, phys_addr_t addr, - phys_addr_t size) { return -ENODEV; } -static inline int qcom_scm_pas_auth_and_reset(u32 peripheral) - { return -ENODEV; } -static inline int qcom_scm_pas_shutdown(u32 peripheral) { return -ENODEV; } -static inline bool qcom_scm_pas_supported(u32 peripheral) { return false; } - -static inline int qcom_scm_io_readl(phys_addr_t addr, unsigned int *val) - { return -ENODEV; } -static inline int qcom_scm_io_writel(phys_addr_t addr, unsigned int val) - { return -ENODEV; } - -static inline bool qcom_scm_restore_sec_cfg_available(void) { return false; } -static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) - { return -ENODEV; } -static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) - { return -ENODEV; } -extern inline int qcom_scm_mem_protect_video_var(u32 cp_start, u32 cp_size, - u32 cp_nonpixel_start, - u32 cp_nonpixel_size) - { return -ENODEV; } -static inline int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, - unsigned int *src, const struct qcom_scm_vmperm *newvm, - unsigned int dest_cnt) { return -ENODEV; } - -static inline bool qcom_scm_ocmem_lock_available(void) { return false; } -static inline int qcom_scm_ocmem_lock(enum qcom_scm_ocmem_client id, u32 offset, - u32 size, u32 mode) { return -ENODEV; } -static inline int qcom_scm_ocmem_unlock(enum qcom_scm_ocmem_client id, - u32 offset, u32 size) { return -ENODEV; } - -static inline bool qcom_scm_ice_available(void) { return false; } -static inline int
qcom_scm_ice_invalidate_key(u32 index) { return -ENODEV; } -static inline int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size, - enum qcom_scm_ice_cipher cipher, - u32 data_unit_size) { return -ENODEV; } - -static inline bool qcom_scm_hdcp_available(void) { return false; } -static inline int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, - u32 *resp) { return -ENODEV; } - -static inline int qcom_scm_qsmmu500_wait_safe_toggle(bool en) - { return -ENODEV; } - -static inline int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, - u64 limit_node, u32 node_id, u64 version) - { return -ENODEV; } - -static inline int qcom_scm_lmh_profile_change(u32 profile_id) { return -ENODEV; } - -static inline bool qcom_scm_lmh_dcvsh_available(void) { return -ENODEV; } -#endif #endif -- cgit v1.2.3 From ba882580f211dbe4fee7f010c9d38dd879db83a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Oct 2021 11:18:46 -0700 Subject: eth: platform: add a helper for loading netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced an rbtree for faster Ethernet address lookup. To maintain netdev->dev_addr in this tree we need to make all the writes to it go through appropriate helpers. There is a handful of drivers which pass netdev->dev_addr as the destination buffer to eth_platform_get_mac_address(). Add a helper which takes a dev pointer instead, so it can call an appropriate helper. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e75116f48cd1..3cf546d2ffd1 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,7 @@ struct device; struct fwnode_handle; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); +int platform_get_ethdev_address(struct device *dev, struct net_device *netdev); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); int device_get_mac_address(struct device *dev, char *addr); -- cgit v1.2.3 From 75ea27d0d62281c31ee259c872dfdeb072cf5e39 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 7 Oct 2021 18:16:50 +0200 Subject: net: introduce a function to check if a netdev name is in use __dev_get_by_name is currently used to either retrieve a net device reference using its name or to check if a name is already used by a registered net device (per ns). In the latter case there is no need to return a reference to a net device. Introduce a new helper, netdev_name_in_use, to check if a name is currently used by a registered net device without leaking a reference to the corresponding net device. This helper uses netdev_name_node_lookup instead of __dev_get_by_name as we don't need the extra logic retrieving a reference to the corresponding net device. Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79163208dfd..15f4a658e436 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2955,6 +2955,7 @@ struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); +bool netdev_name_in_use(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void dev_close(struct net_device *dev); -- cgit v1.2.3 From 0258b5fd7c7124b87e185a1a9322d2c66b1876b7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 22 Sep 2021 11:24:02 -0500 Subject: coredump: Limit coredumps to a single thread group Today when a signal is delivered with a handler of SIG_DFL whose default behavior is to generate a core dump, not only that process but every process that shares the mm is killed. In the case of vfork this looks like a real world problem. Consider the following well-defined sequence. if (vfork() == 0) { execve(...); _exit(EXIT_FAILURE); } If a signal that generates a core dump is received after vfork but before the execve changes the mm, the process that called vfork will also be killed (as the mm is shared). Similarly if the execve fails after the point of no return the kernel delivers SIGSEGV which will kill both the exec'ing process and because the mm is shared the process that called vfork as well. As far as I can tell this behavior is a violation of people's reasonable expectations and of POSIX, and it is unnecessarily fragile when the system is low on memory. Solve this by making a userspace visible change to only kill a single process/thread group. This is possible because Jann Horn recently modified[1] the coredump code so that the mm can safely be modified while the coredump is happening. With LinuxThreads long gone I don't expect anyone to notice this behavior change in practice. To accomplish this move the core_state pointer from mm_struct to signal_struct, which allows different thread groups to coredump simultaneously. In zap_threads remove the work to kill anything except for the current thread group. v2: Remove core_state from the VM_BUG_ON_MM print to fix compile failure when CONFIG_DEBUG_VM is enabled. Reported-by: Stephen Rothwell [1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3") History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Link: https://lkml.kernel.org/r/87y27mvnke.fsf@disp2133 Link: https://lkml.kernel.org/r/20211007144701.67592574@canb.auug.org.au Reviewed-by: Kees Cook Signed-off-by: "Eric W.
Biederman" --- include/linux/mm_types.h | 13 ------------- include/linux/sched/signal.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..1039f6ae922c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -387,17 +387,6 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; -struct core_thread { - struct task_struct *task; - struct core_thread *next; -}; - -struct core_state { - atomic_t nr_threads; - struct core_thread dumper; - struct completion startup; -}; - struct kioctx_table; struct mm_struct { struct { @@ -518,8 +507,6 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ - struct core_state *core_state; /* coredumping support */ - #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e5f4ce622ee6..a8fe2a593a3a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -72,6 +72,17 @@ struct multiprocess_signals { struct hlist_node node; }; +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + +struct core_state { + atomic_t nr_threads; + struct core_thread dumper; + struct completion startup; +}; + /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always @@ -110,6 +121,8 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + struct core_state *core_state; /* coredumping support */ + /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes -- cgit v1.2.3 From 5bded8259ee3815a91791462dfb3312480779c3d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 7 Oct 2021 19:47:11 +0300 Subject: net: dsa: mv88e6xxx: isolate the ATU databases of standalone and bridged ports Similar to commit 6087175b7991 ("net: dsa: mt7530: use independent VLAN learning on VLAN-unaware bridges"), software forwarding between an unoffloaded LAG port (a bonding interface with an unsupported policy) and a mv88e6xxx user port directly under a bridge is broken. We adopt the same strategy, which is to make the standalone ports not find any ATU entry learned on a bridge port. Theory: the mv88e6xxx ATU is looked up by FID and MAC address. There are as many FIDs as VIDs (4096). The FID is derived from the VID when possible (the VTU maps a VID to a FID), with a fallback to the port based default FID value when not (802.1Q Mode is disabled on the port, or the classified VID isn't present in the VTU). The mv88e6xxx driver makes the following use of FIDs and VIDs: - the port's DefaultVID (to which untagged & pvid-tagged packets get classified) is 0 and is absent from the VTU, so this kind of packets is processed in FID 0, the default FID assigned by mv88e6xxx_setup_port. - every time a bridge VLAN is created, mv88e6xxx_port_vlan_join() -> mv88e6xxx_atu_new() associates a FID with that VID which increases linearly starting from 1. Like this: bridge vlan add dev lan0 vid 100 # FID 1 bridge vlan add dev lan1 vid 100 # still FID 1 bridge vlan add dev lan2 vid 1024 # FID 2 The FID allocation made by the driver is sub-optimal for the following reasons: (a) A standalone port has a DefaultPVID of 0 and a default FID of 0 too. A VLAN-unaware bridged port has a DefaultPVID of 0 and a default FID of 0 too. 
The difference is that the bridged ports may learn ATU entries, while the standalone port has the requirement that it must not, and must not find them either. Standalone ports must not use the same FID as ports belonging to a bridge. All standalone ports can use the same FID, since the ATU will never have an entry in that FID. (b) Multiple VLAN-unaware bridges will all use a DefaultPVID of 0 and a default FID of 0 on all their ports. The FDBs will not be isolated between these bridges. Every VLAN-unaware bridge must use the same FID on all its ports, different from the FID of other bridge ports. (c) Each bridge VLAN uses a unique FID which is useful for Independent VLAN Learning, but the same VLAN ID on multiple VLAN-aware bridges will result in the same FID being used by mv88e6xxx_atu_new(). The correct behavior is for VLAN 1 in br0 to have a different FID compared to VLAN 1 in br1. This patch cannot fix all the above. Traditionally the DSA framework did not care about this, and the reality is that DSA core involvement is needed for the aforementioned issues to be solved. The only thing we can solve here is an issue which does not require API changes, and that is issue (a), aka use a different FID for standalone ports vs ports under VLAN-unaware bridges. The first step is deciding what VID and FID to use for standalone ports, and what VID and FID for bridged ports. The 0/0 pair for standalone ports is what they used up till now, let's keep using that. For bridged ports, there are 2 cases: - VLAN-aware ports will never end up using the port default FID, because packets will always be classified to a VID in the VTU or dropped otherwise. The FID is the one associated with the VID in the VTU. - On VLAN-unaware ports, we _could_ leave their DefaultVID (pvid) at zero (just as in the case of standalone ports), and just change the port's default FID from 0 to a different number (say 1). However, Tobias points out that there is one more requirement to cater to: cross-chip bridging. The Marvell DSA header does not carry the FID in it, only the VID. So once a packet crosses a DSA link, if it has a VID of zero it will get classified to the default FID of that cascade port. Relying on a port default FID for upstream cascade ports results in contradictions: a default FID of 0 breaks ATU isolation of bridged ports on the downstream switch, a default FID of 1 breaks standalone ports on the downstream switch. So not only must standalone ports have different FIDs compared to bridged ports, they must also have different DefaultVID values. IEEE 802.1Q defines two reserved VID values: 0 and 4095. So we simply choose 4095 as the DefaultVID of ports belonging to VLAN-unaware bridges, and VID 4095 maps to FID 1. For the xmit operation to look up the same ATU database, we need to put VID 4095 in DSA tags sent to ports belonging to VLAN-unaware bridges too. All shared ports are configured to map this VID to the bridging FID, because they are members of that VLAN in the VTU. Shared ports don't need to have 802.1QMode enabled in any way, they always parse the VID from the DSA header, they don't need to look at the 802.1Q header. We install VID 4095 to the VTU in mv88e6xxx_setup_port(), with the mention that mv88e6xxx_vtu_setup() which was located right below that call was flushing the VTU so those entries wouldn't be preserved. So we need to relocate the VTU flushing prior to the port initialization during ->setup(). 
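To make the reserved-VID scheme concrete, here is a minimal sketch (not the driver's actual code; the bridged-port test via dp->bridge_dev is an assumption for illustration) of how a port's DefaultVID could be picked:

	/* Standalone ports keep VID 0 (mapping to FID 0); ports under a
	 * VLAN-unaware bridge use the reserved VID 4095, which the VTU
	 * maps to the bridging FID 1.
	 */
	u16 pvid = dp->bridge_dev ? MV88E6XXX_VID_BRIDGED
				  : MV88E6XXX_VID_STANDALONE;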
Also note that this is why it is safe to assume that VID 4095 will get associated with FID 1: the user ports haven't been created, so there is no avenue for the user to create a bridge VLAN which could otherwise race with the creation of another FID which would otherwise use up the non-reserved FID value of 1. [ Currently mv88e6xxx_port_vlan_join() doesn't have the option of specifying a preferred FID, it always calls mv88e6xxx_atu_new(). ] mv88e6xxx_port_db_load_purge() is the function to access the ATU for FDB/MDB entries, and it used to determine the FID to use for VLAN-unaware FDB entries (VID=0) using mv88e6xxx_port_get_fid(). But the driver only called mv88e6xxx_port_set_fid() once, during probe, so no surprises, the port FID was always 0, the call to get_fid() was redundant. As much as I would have wanted to not touch that code, the logic is broken when we add a new FID which is not the port-based default. Now the port-based default FID only corresponds to standalone ports, and FDB/MDB entries belong to the bridging service. So while in the future, when the DSA API will support FDB isolation, we will have to figure out the FID based on the bridge number, for now there's a single bridging FID, so hardcode that. Lastly, the tagger needs to check, when it is transmitting a VLAN untagged skb, whether it is sending it towards a bridged or a standalone port. When we see it is bridged we assume the bridge is VLAN-unaware. Not because it cannot be VLAN-aware but: - if we are transmitting from a VLAN-aware bridge we are likely doing so using TX forwarding offload. That code path guarantees that skbs have a vlan hwaccel tag in them, so we would not enter the "else" branch of the "if (skb->protocol == htons(ETH_P_8021Q))" condition. - if we are transmitting on behalf of a VLAN-aware bridge but with no TX forwarding offload (no PVT support, out of space in the PVT, whatever), we would indeed be transmitting with VLAN 4095 instead of the bridge device's pvid. However we would be injecting a "From CPU" frame, and the switch won't learn from that - it only learns from "Forward" frames. So it is inconsequential for address learning. And VLAN 4095 is absolutely enough for the frame to exit the switch, since we never remove that VLAN from any port. Fixes: 57e661aae6a8 ("net: dsa: mv88e6xxx: Link aggregation support") Reported-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/dsa/mv88e6xxx.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/linux/dsa/mv88e6xxx.h (limited to 'include/linux') diff --git a/include/linux/dsa/mv88e6xxx.h b/include/linux/dsa/mv88e6xxx.h new file mode 100644 index 000000000000..8c3d45eca46b --- /dev/null +++ b/include/linux/dsa/mv88e6xxx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright 2021 NXP + */ + +#ifndef _NET_DSA_TAG_MV88E6XXX_H +#define _NET_DSA_TAG_MV88E6XXX_H + +#include + +#define MV88E6XXX_VID_STANDALONE 0 +#define MV88E6XXX_VID_BRIDGED (VLAN_N_VID - 1) + +#endif -- cgit v1.2.3 From 8208461d3912e3e97e31bcbd4ce716e4a251a5dd Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Fri, 8 Oct 2021 15:24:27 +0300 Subject: net/mlx5: Add ifc bits to support optional counters Adding bth_opcode field and the relevant bits. This field will be used to capture and count congestion notification packets (CNP). Adding source_vhca_port support bit. This field will be used to check the capability to use the source_vhca_port as a match criteria in cases of dual port. 
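As an illustration only (not part of this patch), a steering match on the new field could be built roughly like this, assuming already-allocated match_criteria/match_value buffers; 0x81 is the RoCEv2 CNP BTH opcode:

	void *misc_c = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters);
	void *misc_v = MLX5_ADDR_OF(fte_match_param, match_value, misc_parameters);

	/* match congestion notification packets by their BTH opcode */
	MLX5_SET(fte_match_set_misc, misc_c, bth_opcode, 0xff);
	MLX5_SET(fte_match_set_misc, misc_v, bth_opcode, 0x81);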
Signed-off-by: Aharon Landau Reviewed-by: Maor Gottlieb Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 96f5fb2af811..34254dbe7117 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -342,7 +342,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_geneve_oam[0x1]; u8 outer_geneve_protocol_type[0x1]; u8 outer_geneve_opt_len[0x1]; - u8 reserved_at_1e[0x1]; + u8 source_vhca_port[0x1]; u8 source_eswitch_port[0x1]; u8 inner_dmac[0x1]; @@ -393,6 +393,14 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 metadata_reg_c_0[0x1]; }; +struct mlx5_ifc_flow_table_fields_supported_2_bits { + u8 reserved_at_0[0xe]; + u8 bth_opcode[0x1]; + u8 reserved_at_f[0x11]; + + u8 reserved_at_20[0x60]; +}; + struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; u8 reserved_at_1[0x1]; @@ -539,7 +547,7 @@ struct mlx5_ifc_fte_match_set_misc_bits { union mlx5_ifc_gre_key_bits gre_key; u8 vxlan_vni[0x18]; - u8 reserved_at_b8[0x8]; + u8 bth_opcode[0x8]; u8 geneve_vni[0x18]; u8 reserved_at_d8[0x7]; @@ -756,7 +764,15 @@ struct mlx5_ifc_flow_table_nic_cap_bits { struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; - u8 reserved_at_e00[0x1200]; + u8 reserved_at_e00[0x700]; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_rdma; + + u8 reserved_at_1580[0x280]; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_rdma; + + u8 reserved_at_1880[0x780]; u8 sw_steering_nic_rx_action_drop_icm_address[0x40]; -- cgit v1.2.3 From b8dfed636fc6239396c3a2ae5f812505906cf215 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Fri, 8 Oct 2021 15:24:28 +0300 Subject: net/mlx5: Add priorities for counters in RDMA namespaces Add additional flow steering priorities in the RDMA namespace. This allows adding flow counters to count filtered RDMA traffic and then continue processing in the regular RDMA steering flow. 
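For context, a hedged sketch of how a consumer could reach one of the new namespaces, using the existing mlx5_get_flow_namespace() lookup helper (error handling trimmed):

	struct mlx5_flow_namespace *ns;

	/* the counter priorities sit before regular RDMA_RX processing,
	 * then traffic continues into the regular RDMA steering flow */
	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS);
	if (!ns)
		return -EOPNOTSUPP;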
Signed-off-by: Aharon Landau Reviewed-by: Maor Gottlieb Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 2 ++ include/linux/mlx5/fs.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 66eaf0aa7f69..ed0230ff9422 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1456,6 +1456,8 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } +#define MLX5_RDMA_RX_NUM_COUNTERS_PRIOS 2 +#define MLX5_RDMA_TX_NUM_COUNTERS_PRIOS 1 #define MLX5_BY_PASS_NUM_REGULAR_PRIOS 16 #define MLX5_BY_PASS_NUM_DONT_TRAP_PRIOS 16 #define MLX5_BY_PASS_NUM_MULTICAST_PRIOS 1 diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 0106c67e8ccb..f2c3da2006d9 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -83,6 +83,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, + MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS, + MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS, }; enum { -- cgit v1.2.3 From 0392dd51f9c78d46109a408f27dc820300dcd8bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Oct 2021 10:10:10 -0400 Subject: SUNRPC: Per-rpc_clnt task PIDs The current range of RPC task PIDs is 0..65535. That's not adequate for distinguishing tasks across multiple rpc_clnts running high throughput workloads. To help relieve this situation and to reduce the bottleneck of having a single atomic for assigning all RPC task PIDs, assign task PIDs per rpc_clnt. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index a4661646adc9..267b7aeaf1a6 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -40,6 +40,7 @@ struct rpc_clnt { unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ + atomic_t cl_pid; /* task PID counter */ spinlock_t cl_lock; /* spinlock */ struct rpc_xprt __rcu * cl_xprt; /* transport */ const struct rpc_procinfo *cl_procinfo; /* procedure info */ -- cgit v1.2.3 From 0199215216978b612d4e8be11e878b87bc643033 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Sun, 10 Oct 2021 13:03:29 +0900 Subject: mlxsw: spectrum: use netif_is_macsec() instead of open code The open-coded check dev->priv_flags & IFF_MACSEC is already available as netif_is_macsec(), so use netif_is_macsec() instead of the open code. This patch doesn't change the logic. Signed-off-by: Juhee Kang Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 15f4a658e436..0723c1314ea2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5237,7 +5237,7 @@ static inline void netif_keep_dst(struct net_device *dev) static inline bool netif_reduces_vlan_mtu(struct net_device *dev) { /* TODO: reserve and use an additional IFF bit, if we get more users */ - return dev->priv_flags & IFF_MACSEC; + return netif_is_macsec(dev); } extern struct pernet_operations __net_initdata loopback_net_ops; -- cgit v1.2.3 From bdac5c2b243f68ec15f8203c3348ae79fee8e8d8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 Sep 2021 15:23:20 +0900 Subject: bootconfig: Allocate xbc_data inside xbc_init() Allocate 'xbc_data' inside xbc_init() so that the caller does not need to care about the ownership of the copied data. Link: https://lkml.kernel.org/r/163177339986.682366.898762699429769117.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 537e1b991f11..62e09b788172 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -271,7 +271,7 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node, } /* XBC node initializer */ -int __init xbc_init(char *buf, const char **emsg, int *epos); +int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); /* XBC cleanup data structures */ -- cgit v1.2.3 From e306220cb7b7c2948f191414ab06851e143b54c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 Sep 2021 15:23:29 +0900 Subject: bootconfig: Add xbc_get_info() for the node information Add xbc_get_info() API which allows the user to get the number of used xbc_nodes and the size of bootconfig data. This is also useful for checking whether the bootconfig is initialized or not. Link: https://lkml.kernel.org/r/163177340877.682366.4360676589783197627.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 62e09b788172..f955bb7eabbb 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -273,6 +273,8 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node, /* XBC node initializer */ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); +/* XBC node and size information */ +int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ void __init xbc_destroy_all(void); -- cgit v1.2.3 From 115d4d08aeb942133d025a425dd611092893d774 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:39 +0900 Subject: bootconfig: Rename xbc_destroy_all() to xbc_exit() Avoid using this noisy name and use a calmer one. This is just a name change. No functional change.
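Taken together, the bootconfig changes above leave callers with a flow along these lines; a minimal sketch, assuming a copied buffer 'data' of length 'size':

	const char *emsg;
	int epos, ret;

	ret = xbc_init(data, size, &emsg, &epos);	/* node count or -errno */
	if (ret < 0)
		pr_err("bootconfig parse error at %d: %s\n", epos, emsg);
	/* ... look up nodes and values ... */
	xbc_exit();	/* formerly xbc_destroy_all() */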
Link: https://lkml.kernel.org/r/163187295918.2366983.5231840238429996027.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index f955bb7eabbb..7eb7a7f8ade7 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -277,7 +277,7 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_destroy_all(void); +void __init xbc_exit(void); /* Debug dump functions */ void __init xbc_debug_dump(void); -- cgit v1.2.3 From 9b81c9bfff4651abb28bfa6d83c8b879e467963b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:53 +0900 Subject: bootconfig: Remove unused debug function Remove unused xbc_debug_dump() from bootconfig to clean up the code. Link: https://lkml.kernel.org/r/163187297371.2366983.12943349701785875450.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 7eb7a7f8ade7..85cdfd381877 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -279,7 +279,4 @@ int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ void __init xbc_exit(void); -/* Debug dump functions */ -void __init xbc_debug_dump(void); - #endif -- cgit v1.2.3 From 4f292c4886bfdfc2a7191ef4ff3f7aac69d1cc3f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:03:08 +0900 Subject: bootconfig: Replace u16 and u32 with uint16_t and uint32_t Replace u16 and u32 with uint16_t and uint32_t so that tools/bootconfig only needs <stdint.h>. Link: https://lkml.kernel.org/r/163187298835.2366983.9838262576854319669.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 85cdfd381877..a6f8dc51f168 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -25,10 +25,10 @@ * The checksum will be used with the BOOTCONFIG_MAGIC and the size for * embedding the bootconfig in the initrd image. */ -static inline __init u32 xbc_calc_checksum(void *data, u32 size) +static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size) { unsigned char *p = data; - u32 ret = 0; + uint32_t ret = 0; while (size--) ret += *p++; @@ -38,10 +38,10 @@ static inline __init u32 xbc_calc_checksum(void *data, u32 size) /* XBC tree node */ struct xbc_node { - u16 next; - u16 child; - u16 parent; - u16 data; + uint16_t next; + uint16_t child; + uint16_t parent; + uint16_t data; } __attribute__ ((__packed__)); #define XBC_KEY 0 -- cgit v1.2.3 From 4ee1b4cac236650979a5d9b745bb0a83efde3a46 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:03:16 +0900 Subject: bootconfig: Cleanup dummy headers in tools/bootconfig Cleanup dummy headers in tools/bootconfig/include except for tools/bootconfig/include/linux/bootconfig.h. For this change, I use the __KERNEL__ macro to split kernel header #include and introduce xbc_alloc_mem() and xbc_free_mem().
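On the tools side, the user-space half of that split presumably boils down to malloc-backed wrappers; a hedged sketch of the idea (not the actual patch body, and the exact wrapper form is an assumption):

	/* user space fallbacks for the kernel's memblock-based allocation;
	 * needs <stdlib.h> */
	static inline void *xbc_alloc_mem(size_t size)
	{
		return malloc(size);
	}

	static inline void xbc_free_mem(void *addr, size_t size)
	{
		free(addr);
	}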
Link: https://lkml.kernel.org/r/163187299574.2366983.18371329724128746091.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index a6f8dc51f168..a4665c7ab07c 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -7,8 +7,18 @@ * Author: Masami Hiramatsu */ +#ifdef __KERNEL__ #include <linux/kernel.h> #include <linux/types.h> +#else /* !__KERNEL__ */ +/* + * NOTE: This is only for tools/bootconfig, because tools/bootconfig will + * run the parser sanity test. + * This does NOT mean linux/bootconfig.h is available in the user space. + * However, if you change this file, please make sure the tools/bootconfig + * has no issue on building and running. + */ +#endif #define BOOTCONFIG_MAGIC "#BOOTCONFIG\n" #define BOOTCONFIG_MAGIC_LEN 12 -- cgit v1.2.3 From bf2928c7a284e31f9f91a004b5dca09c522157c0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 10 Sep 2021 08:22:06 +0200 Subject: PCI/VPD: Add pci_read/write_vpd_any() In certain cases we need a variant of pci_read_vpd()/pci_write_vpd() that does not check against dev->vpd.len. Such cases are: - Reading VPD if dev->vpd.len isn't set yet (in pci_vpd_size()) - Devices that map non-VPD information to arbitrary places in VPD address space (example: Chelsio T3 EEPROM write-protect flag) Therefore add pci_read_vpd_any() and pci_write_vpd_any() that check against PCI_VPD_MAX_SIZE only. Link: https://lore.kernel.org/r/93ecce28-a158-f02a-d134-8afcaced8efe@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Acked-by: Jakub Kicinski --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..9649bd9e468d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,6 +1350,8 @@ void pci_unlock_rescan_remove(void); /* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); +ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf); +ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); -- cgit v1.2.3 From f614fb60a1983819e83796198de2b607fba75e99 Mon Sep 17 00:00:00 2001 From: Wenbin Mei Date: Fri, 17 Sep 2021 20:48:02 +0800 Subject: mmc: core: Add host specific tuning support for eMMC HS400 mode This adds a ->execute_hs400_tuning() host callback to enable optional support for host specific tuning for eMMC HS400 mode. Additionally, share mmc_get_ext_csd() through the public host header file, to allow it to be used by the host drivers, which is needed to support the HS400 tuning.
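A hedged sketch of how a host driver might wire up the new callback; the my_host names are hypothetical:

	static int my_host_execute_hs400_tuning(struct mmc_host *mmc,
						struct mmc_card *card)
	{
		/* vendor specific HS400 delay tuning goes here, e.g.
		 * stepping a delay line and verifying reads per step */
		return 0;
	}

	static const struct mmc_host_ops my_host_ops = {
		.execute_hs400_tuning = my_host_execute_hs400_tuning,
	};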
Signed-off-by: Wenbin Mei Link: https://lore.kernel.org/r/20210917124803.22871-3-wenbin.mei@mediatek.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0c0c9a0fdf57..b6c082e2664c 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -162,6 +162,9 @@ struct mmc_host_ops { /* Prepare HS400 target operating frequency depending host driver */ int (*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios); + /* Execute HS400 tuning depending host driver */ + int (*execute_hs400_tuning)(struct mmc_host *host, struct mmc_card *card); + /* Prepare switch to DDR during the HS400 init sequence */ int (*hs400_prepare_ddr)(struct mmc_host *host); @@ -634,5 +637,6 @@ static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); +int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3 From 0ae93b99beb283438aa571a6add4eab0c077d576 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 7 Oct 2021 16:17:24 -0400 Subject: SUNRPC: Simplify the SVC dispatch code path Micro-optimization: The last user of the generic SVC dispatch code path has been removed, so svc_process_common() can be simplified. This declutters the hot path so that the by-far most common case (a dispatch function exists) is made the /only/ path. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6263410c948a..4205a6ef4770 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -443,10 +443,7 @@ struct svc_version { /* Need xprt with congestion control */ bool vs_need_cong_ctrl; - /* Override dispatch function (e.g. when caching replies). - * A return value of 0 means drop the request. - * vs_dispatch == NULL means use default dispatcher. - */ + /* Dispatch function */ int (*vs_dispatch)(struct svc_rqst *, __be32 *); }; -- cgit v1.2.3 From 0bc73ad46a76ed6ece4dcacb28858e7b38561e1c Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 26 Sep 2021 17:55:41 +0300 Subject: net/mlx5e: Mutually exclude RX-FCS and RX-port-timestamp Due to current HW arch limitations, RX-FCS (scattering FCS frame field to software) and RX-port-timestamp (improved timestamp accuracy on the receive side) can't work together. RX-port-timestamp is not controlled by the user and it is enabled by default when supported by the HW/FW. This patch sets RX-port-timestamp opposite to RX-FCS configuration. 
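Conceptually the driver keeps the two features mutually exclusive via the PCMR register fields added below; a rough sketch, with the surrounding access-register plumbing and the enable_rx_fcs variable assumed:

	/* enabling RX-FCS forces port timestamping over CRC off, and
	 * vice versa */
	MLX5_SET(pcmr_reg, in, fcs_chk, enable_rx_fcs);
	MLX5_SET(pcmr_reg, in, rx_ts_over_crc, !enable_rx_fcs);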
Fixes: 102722fc6832 ("net/mlx5e: Add support for RXFCS feature flag") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3638d09ba77..993204a6c1a1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9475,16 +9475,22 @@ struct mlx5_ifc_pcmr_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; u8 reserved_at_10[0x10]; + u8 entropy_force_cap[0x1]; u8 entropy_calc_cap[0x1]; u8 entropy_gre_calc_cap[0x1]; - u8 reserved_at_23[0x1b]; + u8 reserved_at_23[0xf]; + u8 rx_ts_over_crc_cap[0x1]; + u8 reserved_at_33[0xb]; u8 fcs_cap[0x1]; u8 reserved_at_3f[0x1]; + u8 entropy_force[0x1]; u8 entropy_calc[0x1]; u8 entropy_gre_calc[0x1]; - u8 reserved_at_43[0x1b]; + u8 reserved_at_43[0xf]; + u8 rx_ts_over_crc[0x1]; + u8 reserved_at_53[0xb]; u8 fcs_chk[0x1]; u8 reserved_at_5f[0x1]; }; -- cgit v1.2.3 From 8e9028b3790ddb02c407e1a4da0ab7cf82790e7e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 12 Oct 2021 15:42:59 -0500 Subject: PCI: Return NULL for to_pci_driver(NULL) to_pci_driver() takes a pointer to a struct device_driver and uses container_of() to find the struct pci_driver that contains it. If given a NULL pointer to a struct device_driver, return a NULL pci_driver pointer instead of applying container_of() to NULL. This simplifies callers that would otherwise have to check for a NULL pointer first. Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..a2d62165a7f7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -900,7 +900,10 @@ struct pci_driver { struct pci_dynids dynids; }; -#define to_pci_driver(drv) container_of(drv, struct pci_driver, driver) +static inline struct pci_driver *to_pci_driver(struct device_driver *drv) +{ + return drv ? container_of(drv, struct pci_driver, driver) : NULL; +} /** * PCI_DEVICE - macro used to describe a specific PCI device -- cgit v1.2.3 From 28da0555c3b542d605e4ca26eea6a740cf2c9174 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 22 Sep 2021 17:37:25 +0300 Subject: net: dsa: move sja1110_process_meta_tstamp inside the tagging protocol driver The problem is that DSA tagging protocols really must not depend on the switch driver, because this creates a circular dependency at insmod time, and the switch driver will effectively not load when the tagging protocol driver is missing. The code was structured in the way it was for a reason, though. The DSA driver-facing API for PTP timestamping relies on the assumption that two-step TX timestamps are provided by the hardware in an out-of-band manner, typically by raising an interrupt and making that timestamp available inside some sort of FIFO which is to be accessed over SPI/MDIO/etc. So the API puts .port_txtstamp into dsa_switch_ops, because it is expected that the switch driver needs to save some state (like put the skb into a queue until its TX timestamp arrives). On SJA1110, TX timestamps are provided by the switch as Ethernet packets, so this makes them be received and processed by the tagging protocol driver. 
This in itself is great, because the timestamps are full 64-bit and do not require reconstruction, and since Ethernet is the fastest I/O method available to/from the switch, PTP timestamps arrive very quickly, no matter how bottlenecked the SPI connection is, because SPI interaction is not needed at all. DSA's code structure and strict isolation between the tagging protocol driver and the switch driver break the natural code organization. When the tagging protocol driver receives a packet which is classified as a metadata packet containing timestamps, it passes those timestamps one by one to the switch driver, which then proceeds to compare them based on the recorded timestamp ID that was generated in .port_txtstamp. The communication between the tagging protocol and the switch driver is done through a method exported by the switch driver, sja1110_process_meta_tstamp. To satisfy build requirements, we force a dependency to build the tagging protocol driver as a module when the switch driver is a module. However, as explained in the first paragraph, that causes the circular dependency. To solve this, move the skb queue from struct sja1105_private :: struct sja1105_ptp_data to struct sja1105_private :: struct sja1105_tagger_data. The latter is a data structure for which hacks have already been put into place to be able to create persistent storage per switch that is accessible from the tagging protocol driver (see sja1105_setup_ports). With the skb queue directly accessible from the tagging protocol driver, we can now move sja1110_process_meta_tstamp into the tagging driver itself, and avoid exporting a symbol. Fixes: 566b18c8b752 ("net: dsa: sja1105: implement TX timestamping for SJA1110") Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/dsa/sja1105.h | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 171106202fe5..0485ab2fcc46 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -48,6 +48,10 @@ struct sja1105_tagger_data { spinlock_t meta_lock; unsigned long state; u8 ts_id; + /* Used on SJA1110 where meta frames are generated only for + * 2-step TX timestamps + */ + struct sk_buff_head skb_txtstamp_queue; }; struct sja1105_skb_cb { @@ -69,25 +73,20 @@ struct sja1105_port { bool hwts_tx_en; }; -enum sja1110_meta_tstamp { - SJA1110_META_TSTAMP_TX = 0, - SJA1110_META_TSTAMP_RX = 1, -}; - -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) - -void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, u8 ts_id, - enum sja1110_meta_tstamp dir, u64 tstamp); - -#else +/* Timestamps are in units of 8 ns clock ticks (equivalent to + * a fixed 125 MHz clock). 
+ */ +#define SJA1105_TICK_NS 8 -static inline void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, - u8 ts_id, enum sja1110_meta_tstamp dir, - u64 tstamp) +static inline s64 ns_to_sja1105_ticks(s64 ns) { + return ns / SJA1105_TICK_NS; } -#endif /* IS_ENABLED(CONFIG_NET_DSA_SJA1105_PTP) */ +static inline s64 sja1105_ticks_to_ns(s64 ticks) +{ + return ticks * SJA1105_TICK_NS; +} #if IS_ENABLED(CONFIG_NET_DSA_SJA1105) -- cgit v1.2.3 From 4ac0567e40b334b54988e3c28a2425ff9c8bdd35 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 22 Sep 2021 17:37:26 +0300 Subject: net: dsa: sja1105: break dependency between dsa_port_is_sja1105 and switch driver It's nice to be able to test a tagging protocol with dsa_loop, but not at the cost of losing the ability of building the tagging protocol and switch driver as modules, because as things stand, there is a circular dependency between the two. Tagging protocol drivers cannot depend on switch drivers, that is a hard fact. The reasoning behind the blamed patch was that accessing dp->priv should first make sure that the structure behind that pointer is what we really think it is. Currently the "sja1105" and "sja1110" tagging protocols only operate with the sja1105 switch driver, just like any other tagging protocol and switch combination. The only way to mix and match them is by modifying the code, and this applies to dsa_loop as well (by default that uses DSA_TAG_PROTO_NONE). So while in principle there is an issue, in practice there isn't one. Until we extend dsa_loop to allow user space configuration, treat the problem as a non-issue and just say that DSA ports found by tag_sja1105 are always sja1105 ports, which is in fact true. But keep the dsa_port_is_sja1105 function so that it's easy to patch it during testing, and rely on dead code elimination. Fixes: 994d2cbb08ca ("net: dsa: tag_sja1105: be dsa_loop-safe") Link: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/dsa/sja1105.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 0485ab2fcc46..9e07079528a5 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -88,22 +88,9 @@ static inline s64 sja1105_ticks_to_ns(s64 ticks) return ticks * SJA1105_TICK_NS; } -#if IS_ENABLED(CONFIG_NET_DSA_SJA1105) - -extern const struct dsa_switch_ops sja1105_switch_ops; - -static inline bool dsa_port_is_sja1105(struct dsa_port *dp) -{ - return dp->ds->ops == &sja1105_switch_ops; -} - -#else - static inline bool dsa_port_is_sja1105(struct dsa_port *dp) { - return false; + return true; } -#endif - #endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From deab6b1cd9789bb9bd466d5e76aecb8b336259b4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 Oct 2021 14:40:40 +0300 Subject: net: dsa: tag_ocelot: break circular dependency with ocelot switch lib driver As explained here: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ DSA tagging protocol drivers cannot depend on symbols exported by switch drivers, because this creates a circular dependency that breaks module autoloading. The tag_ocelot.c file depends on the ocelot_ptp_rew_op() function exported by the common ocelot switch lib. 
This function looks at OCELOT_SKB_CB(skb) and computes how to populate the REW_OP field of the DSA tag, for PTP timestamping (the command: one-step/two-step, and the TX timestamp identifier). None of that requires deep insight into the driver, it is quite stateless, as it only depends upon the skb->cb. So let's make it a static inline function and put it in include/linux/dsa/ocelot.h, a file that despite its name is used by the ocelot switch driver for populating the injection header too - since commit 40d3f295b5fe ("net: mscc: ocelot: use common tag parsing code with DSA"). With that function declared as static inline, its body is expanded inside each call site, so the dependency is broken and the DSA tagger can be built without the switch library, upon which the felix driver depends. Fixes: 39e5308b3250 ("net: mscc: ocelot: support PTP Sync one-step timestamping") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/ocelot.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 435777a0073c..50641a7529ad 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -6,6 +6,26 @@ #define _NET_DSA_TAG_OCELOT_H #include <linux/packing.h> +#include <linux/skbuff.h> + +struct ocelot_skb_cb { + struct sk_buff *clone; + unsigned int ptp_class; /* valid only for clones */ + u8 ptp_cmd; + u8 ts_id; +}; + +#define OCELOT_SKB_CB(skb) \ + ((struct ocelot_skb_cb *)((skb)->cb)) + +#define IFH_TAG_TYPE_C 0 +#define IFH_TAG_TYPE_S 1 + +#define IFH_REW_OP_NOOP 0x0 +#define IFH_REW_OP_DSCP 0x1 +#define IFH_REW_OP_ONE_STEP_PTP 0x2 +#define IFH_REW_OP_TWO_STEP_PTP 0x3 +#define IFH_REW_OP_ORIGIN_PTP 0x5 #define OCELOT_TAG_LEN 16 #define OCELOT_SHORT_PREFIX_LEN 4 @@ -215,4 +235,21 @@ static inline void ocelot_ifh_set_vid(void *injection, u64 vid) packing(injection, &vid, 11, 0, OCELOT_TAG_LEN, PACK, 0); } +/* Determine the PTP REW_OP to use for injecting the given skb */ +static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) +{ + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; + u32 rew_op = 0; + + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { + rew_op = ptp_cmd; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; + } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { + rew_op = ptp_cmd; + } + + return rew_op; +} + #endif -- cgit v1.2.3 From 49f885b2d97093451410e7279aa29d81e094e108 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 12 Oct 2021 14:40:41 +0300 Subject: net: dsa: tag_ocelot_8021q: break circular dependency with ocelot switch lib Michael reported that when using the "ocelot-8021q" tagging protocol, the switch driver module must be manually loaded before the tagging protocol can be loaded/is available. This appears to be the same problem described here: https://lore.kernel.org/netdev/20210908220834.d7gmtnwrorhharna@skbuf/ where due to the fact that DSA tagging protocols make use of symbols exported by the switch drivers, circular dependencies appear and this breaks module autoloading. The ocelot_8021q driver needs the ocelot_can_inject() and ocelot_port_inject_frame() functions from the switch library. Previously the wrong approach was taken to solve that dependency: shims were provided for the case where the ocelot switch library was compiled out, but that turns out to be insufficient, because the dependency when the switch lib _is_ compiled is problematic too.
We cannot declare ocelot_can_inject() and ocelot_port_inject_frame() as static inline functions, because these access I/O functions like __ocelot_write_ix() which is called by ocelot_write_rix(). Making those static inline basically means exposing the whole guts of the ocelot switch library, not ideal... We already have one tagging protocol driver which calls into the switch driver during xmit but not using any exported symbol: sja1105_defer_xmit. We can do the same thing here: create a kthread worker and one work item per skb, and let the switch driver itself do the register accesses to send the skb, and then consume it. Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping") Reported-by: Michael Walle Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/dsa/ocelot.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 50641a7529ad..8ae999f587c4 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -5,6 +5,7 @@ #ifndef _NET_DSA_TAG_OCELOT_H #define _NET_DSA_TAG_OCELOT_H +#include <linux/kthread.h> #include <linux/packing.h> #include <linux/skbuff.h> @@ -160,6 +161,17 @@ struct ocelot_skb_cb { * +------+------+------+------+------+------+------+------+ */ +struct felix_deferred_xmit_work { + struct dsa_port *dp; + struct sk_buff *skb; + struct kthread_work work; +}; + +struct felix_port { + void (*xmit_work_fn)(struct kthread_work *work); + struct kthread_worker *xmit_worker; +}; + static inline void ocelot_xfh_get_rew_val(void *extraction, u64 *rew_val) { packing(extraction, rew_val, 116, 85, OCELOT_TAG_LEN, UNPACK, 0); } -- cgit v1.2.3 From 5008062f1c3f5af3acf86164aa6fcc77b0c7bdce Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 13 Oct 2021 14:19:56 +0100 Subject: nvmem: core: add nvmem cell post processing callback Some NVMEM providers have certain nvmem cells encoded, which require post processing before they can actually be used. For example, a mac-address may be stored in ascii, delimited, or in reverse order. Having a post-process callback hook to provider drivers would enable them to do this vendor specific post processing before nvmem consumers see it. Tested-by: Joakim Zhang Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20211013131957.30271-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 104505e9028f..98efb7b5660d 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -19,6 +19,9 @@ typedef int (*nvmem_reg_read_t)(void *priv, unsigned int offset, void *val, size_t bytes); typedef int (*nvmem_reg_write_t)(void *priv, unsigned int offset, void *val, size_t bytes); +/* used for vendor specific post processing of cell data */ +typedef int (*nvmem_cell_post_process_t)(void *priv, const char *id, unsigned int offset, + void *buf, size_t bytes); enum nvmem_type { NVMEM_TYPE_UNKNOWN = 0, @@ -62,6 +65,7 @@ struct nvmem_keepout { * @no_of_node: Device should not use the parent's of_node even if it's !NULL. * @reg_read: Callback to read data. * @reg_write: Callback to write data. + * @cell_post_process: Callback for vendor specific post processing of cell data * @size: Device size. * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride.
@@ -92,6 +96,7 @@ struct nvmem_config { bool no_of_node; nvmem_reg_read_t reg_read; nvmem_reg_write_t reg_write; + nvmem_cell_post_process_t cell_post_process; int size; int word_size; int stride; -- cgit v1.2.3 From 16c663642c7ec03cd4cee5fec520bb69e97babe4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 12 Oct 2021 11:57:22 -0400 Subject: SUNRPC: Replace the "__be32 *p" parameter to .pc_decode The passed-in value of the "__be32 *p" parameter is now unused in every server-side XDR decoder, and can be removed. Note also that there is a line in each decoder that sets up a local pointer to a struct xdr_stream. Passing that pointer from the dispatcher instead saves one line per decoder function. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/lockd/xdr.h | 19 ++++++++++--------- include/linux/lockd/xdr4.h | 19 +++++++++---------- include/linux/sunrpc/svc.h | 3 ++- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index a98309c0121c..170ad6f5596a 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -96,18 +96,19 @@ struct nlm_reboot { */ #define NLMSVC_XDRSIZE sizeof(struct nlm_args) -int nlmsvc_decode_testargs(struct svc_rqst *, __be32 *); +int nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); + int nlmsvc_encode_testres(struct svc_rqst *, __be32 *); -int nlmsvc_decode_lockargs(struct svc_rqst *, __be32 *); -int nlmsvc_decode_cancargs(struct svc_rqst *, __be32 *); -int nlmsvc_decode_unlockargs(struct svc_rqst *, __be32 *); int nlmsvc_encode_res(struct svc_rqst *, __be32 *); -int nlmsvc_decode_res(struct svc_rqst *, __be32 *); int nlmsvc_encode_void(struct svc_rqst *, __be32 *); -int nlmsvc_decode_void(struct svc_rqst *, __be32 *); -int nlmsvc_decode_shareargs(struct svc_rqst *, __be32 *); int nlmsvc_encode_shareres(struct svc_rqst *, __be32 *); -int nlmsvc_decode_notify(struct svc_rqst *, __be32 *); -int nlmsvc_decode_reboot(struct svc_rqst *, __be32 *); #endif /* LOCKD_XDR_H */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 5ae766f26e04..68e14e0f2b1f 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -22,21 +22,20 @@ #define nlm4_fbig cpu_to_be32(NLM_FBIG) #define nlm4_failed cpu_to_be32(NLM_FAILED) +int nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_shareargs(struct svc_rqst 
*rqstp, struct xdr_stream *xdr); +int nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); - -int nlm4svc_decode_testargs(struct svc_rqst *, __be32 *); int nlm4svc_encode_testres(struct svc_rqst *, __be32 *); -int nlm4svc_decode_lockargs(struct svc_rqst *, __be32 *); -int nlm4svc_decode_cancargs(struct svc_rqst *, __be32 *); -int nlm4svc_decode_unlockargs(struct svc_rqst *, __be32 *); int nlm4svc_encode_res(struct svc_rqst *, __be32 *); -int nlm4svc_decode_res(struct svc_rqst *, __be32 *); int nlm4svc_encode_void(struct svc_rqst *, __be32 *); -int nlm4svc_decode_void(struct svc_rqst *, __be32 *); -int nlm4svc_decode_shareargs(struct svc_rqst *, __be32 *); int nlm4svc_encode_shareres(struct svc_rqst *, __be32 *); -int nlm4svc_decode_notify(struct svc_rqst *, __be32 *); -int nlm4svc_decode_reboot(struct svc_rqst *, __be32 *); extern const struct rpc_version nlm_version4; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4205a6ef4770..da3c5bc43d85 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -454,7 +454,8 @@ struct svc_procedure { /* process the request: */ __be32 (*pc_func)(struct svc_rqst *); /* XDR decode args: */ - int (*pc_decode)(struct svc_rqst *, __be32 *data); + int (*pc_decode)(struct svc_rqst *rqstp, + struct xdr_stream *xdr); /* XDR encode result: */ int (*pc_encode)(struct svc_rqst *, __be32 *data); /* XDR free result: */ -- cgit v1.2.3 From c44b31c263798ec34614dd394c31ef1a2e7e716e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 12 Oct 2021 11:57:28 -0400 Subject: SUNRPC: Change return value type of .pc_decode Returning an undecorated integer is an age-old trope, but it's not clear (even to previous experts in this code) that the only valid return values are 1 and 0. These functions do not return a negative errno, rpc_stat value, or a positive length. Document there are only two valid return values by having .pc_decode return only true or false. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/lockd/xdr.h | 18 +++++++++--------- include/linux/lockd/xdr4.h | 18 +++++++++--------- include/linux/sunrpc/svc.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 170ad6f5596a..e1362244f909 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -96,15 +96,15 @@ struct nlm_reboot { */ #define NLMSVC_XDRSIZE sizeof(struct nlm_args) -int nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); int nlmsvc_encode_testres(struct svc_rqst *, __be32 *); int nlmsvc_encode_res(struct svc_rqst *, __be32 *); diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 68e14e0f2b1f..376b8f6a3763 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -22,15 +22,15 @@ #define nlm4_fbig cpu_to_be32(NLM_FBIG) #define nlm4_failed cpu_to_be32(NLM_FAILED) -int nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct 
xdr_stream *xdr); +bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); int nlm4svc_encode_testres(struct svc_rqst *, __be32 *); int nlm4svc_encode_res(struct svc_rqst *, __be32 *); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index da3c5bc43d85..d6109fa7a57b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -454,7 +454,7 @@ struct svc_procedure { /* process the request: */ __be32 (*pc_func)(struct svc_rqst *); /* XDR decode args: */ - int (*pc_decode)(struct svc_rqst *rqstp, + bool (*pc_decode)(struct svc_rqst *rqstp, struct xdr_stream *xdr); /* XDR encode result: */ int (*pc_encode)(struct svc_rqst *, __be32 *data); -- cgit v1.2.3 From fda494411485aff91768842c532f90fb8eb54943 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 13 Oct 2021 10:41:06 -0400 Subject: SUNRPC: Replace the "__be32 *p" parameter to .pc_encode The passed-in value of the "__be32 *p" parameter is now unused in every server-side XDR encoder, and can be removed. Note also that there is a line in each encoder that sets up a local pointer to a struct xdr_stream. Passing that pointer from the dispatcher instead saves one line per encoder function. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/lockd/xdr.h | 8 ++++---- include/linux/lockd/xdr4.h | 8 ++++---- include/linux/sunrpc/svc.h | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index e1362244f909..d8bd26a5525e 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -106,9 +106,9 @@ bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_encode_testres(struct svc_rqst *, __be32 *); -int nlmsvc_encode_res(struct svc_rqst *, __be32 *); -int nlmsvc_encode_void(struct svc_rqst *, __be32 *); -int nlmsvc_encode_shareres(struct svc_rqst *, __be32 *); +int nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); #endif /* LOCKD_XDR_H */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 376b8f6a3763..50677be3557d 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -32,10 +32,10 @@ bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_encode_testres(struct svc_rqst *, __be32 *); -int nlm4svc_encode_res(struct svc_rqst *, __be32 *); -int nlm4svc_encode_void(struct svc_rqst *, __be32 *); -int nlm4svc_encode_shareres(struct svc_rqst *, __be32 *); +int nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +int nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); extern const struct rpc_version nlm_version4; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d6109fa7a57b..85694cd0db66 100644 --- 
a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -457,7 +457,8 @@ struct svc_procedure { bool (*pc_decode)(struct svc_rqst *rqstp, struct xdr_stream *xdr); /* XDR encode result: */ - int (*pc_encode)(struct svc_rqst *, __be32 *data); + int (*pc_encode)(struct svc_rqst *rqstp, + struct xdr_stream *xdr); /* XDR free result: */ void (*pc_release)(struct svc_rqst *); unsigned int pc_argsize; /* argument struct size */ -- cgit v1.2.3 From 130e2054d4a652a2bd79fb1557ddcd19c053cb37 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 13 Oct 2021 10:41:13 -0400 Subject: SUNRPC: Change return value type of .pc_encode Returning an undecorated integer is an age-old trope, but it's not clear (even to previous experts in this code) that the only valid return values are 1 and 0. These functions do not return a negative errno, rpc_stat value, or a positive length. Document there are only two valid return values by having .pc_encode return only true or false. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/lockd/xdr.h | 8 ++++---- include/linux/lockd/xdr4.h | 8 ++++---- include/linux/sunrpc/svc.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index d8bd26a5525e..398f70093cd3 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -106,9 +106,9 @@ bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); #endif /* LOCKD_XDR_H */ diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 50677be3557d..9a6b55da8fd6 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -32,10 +32,10 @@ bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); -int nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr); +bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr); extern const struct rpc_version nlm_version4; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 85694cd0db66..0ae28ae6caf2 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -457,7 +457,7 @@ struct 
svc_procedure { bool (*pc_decode)(struct svc_rqst *rqstp, struct xdr_stream *xdr); /* XDR encode result: */ - int (*pc_encode)(struct svc_rqst *rqstp, + bool (*pc_encode)(struct svc_rqst *rqstp, struct xdr_stream *xdr); /* XDR free result: */ void (*pc_release)(struct svc_rqst *); -- cgit v1.2.3 From 40af35fdf79c77e19c597e47cc8fe9a8e200de30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 12 Oct 2021 09:06:32 -0700 Subject: netdevice: demote the type of some dev_addr_set() helpers __dev_addr_set() and dev_addr_mod() are pretty low level; let the arguments be void, there's no chance for confusion in callers converted to use them. Keep u8 in dev_addr_set() because some of the callers are converted from a loop and we want to make sure assignments are not from an array of a different type. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0723c1314ea2..f33af341bfb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4643,7 +4643,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ static inline void -__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len) +__dev_addr_set(struct net_device *dev, const void *addr, size_t len) { memcpy(dev->dev_addr, addr, len); } @@ -4655,7 +4655,7 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) static inline void dev_addr_mod(struct net_device *dev, unsigned int offset, - const u8 *addr, size_t len) + const void *addr, size_t len) { memcpy(&dev->dev_addr[offset], addr, len); } -- cgit v1.2.3 From 06de2cd788bfa3b51137e9bd471d68029ab68103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:39:27 +0200 Subject: gpio: max730x: Make __max730x_remove() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An SPI or I2C remove callback is only called for devices that probed successfully. In this case, this implies that __max730x_probe() set a non-NULL driver data, so the check ts == NULL is never true. With this check dropped, __max730x_remove() returns zero unconditionally. Make it return void instead, which makes it easier to see in the callers that there is no error to handle. Also, the return value of I2C and SPI remove callbacks is ignored anyway.
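To illustrate, the SPI-side caller then reduces to something like the following (a sketch of the driver glue, not part of this header patch):

	static int max7301_remove(struct spi_device *spi)
	{
		/* __max730x_remove() cannot fail anymore, so there is
		 * no error to propagate back to the SPI core.
		 */
		__max730x_remove(&spi->dev);
		return 0;
	}

The I2C variant is analogous; both still return 0 to their bus core until the bus remove callbacks themselves are converted to void.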
Signed-off-by: Uwe Kleine-König Signed-off-by: Bartosz Golaszewski --- include/linux/spi/max7301.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/max7301.h b/include/linux/spi/max7301.h index 21449067aedb..e392c53758bc 100644 --- a/include/linux/spi/max7301.h +++ b/include/linux/spi/max7301.h @@ -31,6 +31,6 @@ struct max7301_platform_data { u32 input_pullup_active; }; -extern int __max730x_remove(struct device *dev); +extern void __max730x_remove(struct device *dev); extern int __max730x_probe(struct max7301 *ts); #endif -- cgit v1.2.3 From 6312d52838b21f5c4a5afa1269a00df4364fd354 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Wed, 13 Oct 2021 15:57:43 +0200 Subject: marvell: octeontx2: build error: unknown type name 'u64' Building an arm64 allmodconfig kernel, the following build error shows up:

In file included from drivers/crypto/marvell/octeontx2/cn10k_cpt.c:4:
include/linux/soc/marvell/octeontx2/asm.h:38:15: error: unknown type name 'u64'
   38 | static inline u64 otx2_atomic64_fetch_add(u64 incr, u64 *ptr)
      |               ^~~

Include linux/types.h in asm.h so the compiler knows what the type 'u64' is. Fixes: af3826db74d1 ("octeontx2-pf: Use hardware register for CQE count") Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20211013135743.3826594-1-anders.roxell@linaro.org Signed-off-by: Jakub Kicinski --- include/linux/soc/marvell/octeontx2/asm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index 0f79fd7f81a1..d683251a0b40 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -5,6 +5,7 @@ #ifndef __SOC_OTX2_ASM_H #define __SOC_OTX2_ASM_H +#include <linux/types.h> #if defined(CONFIG_ARM64) /* * otx2_lmt_flush is used for LMT store operation. -- cgit v1.2.3 From 78b727d0281507fabf954573420790b21e34aefe Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 17 Sep 2021 11:04:34 +0800 Subject: clk: qcom: smd-rpm: Add QCM2290 RPM clock support Add support for RPM-managed clocks on the QCM2290 platform. Signed-off-by: Shawn Guo Link: https://lore.kernel.org/r/20210917030434.19859-4-shawn.guo@linaro.org Reviewed-by: Bjorn Andersson Signed-off-by: Stephen Boyd --- include/linux/soc/qcom/smd-rpm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/smd-rpm.h b/include/linux/soc/qcom/smd-rpm.h index 60e66fc9b6bf..860dd8cdf9f3 100644 --- a/include/linux/soc/qcom/smd-rpm.h +++ b/include/linux/soc/qcom/smd-rpm.h @@ -38,6 +38,8 @@ struct qcom_smd_rpm; #define QCOM_SMD_RPM_IPA_CLK 0x617069 #define QCOM_SMD_RPM_CE_CLK 0x6563 #define QCOM_SMD_RPM_AGGR_CLK 0x72676761 +#define QCOM_SMD_RPM_HWKM_CLK 0x6d6b7768 +#define QCOM_SMD_RPM_PKA_CLK 0x616b70 int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm, int state, -- cgit v1.2.3 From 9974cb5c879048f5144d0660c4932d98176213c4 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Wed, 13 Oct 2021 17:47:02 +0800 Subject: net: delete redundant function declaration The implementation of netdev_all_upper_get_next_dev_rcu() was removed in: commit f1170fd462c6 ("net: Remove all_adj_list and its references") so delete the redundant declaration from the header file.
Fixes: f1170fd462c6 ("net: Remove all_adj_list and its references") Signed-off-by: Chen Wandun Link: https://lore.kernel.org/r/20211013094702.3931071-1-chenwandun@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f33af341bfb2..173984414f38 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4801,8 +4801,6 @@ struct netdev_nested_priv { bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); -struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter); #ifdef CONFIG_LOCKDEP static LIST_HEAD(net_unlink_list); -- cgit v1.2.3 From f0ada6da3a0d69682e21f1783d02676e0fbf1bc1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 Oct 2021 17:37:07 +0300 Subject: device property: Add missed header in fwnode.h When adding some stuff to the header file we must not rely on implicit dependencies that happen to be satisfied by luck or by bugs in other headers. Hence fwnode.h needs to use bits.h directly. Fixes: c2c724c868c4 ("driver core: Add fw_devlink_parse_fwtree()") Cc: Saravana Kannan Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211013143707.80222-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 9f4ad719bfe3..3a532ba66f6c 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/list.h> +#include <linux/bits.h> #include <linux/err.h> struct fwnode_operations; -- cgit v1.2.3 From 5de62ea84abd732ded7c5569426fd71c0420f83e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: sched,livepatch: Use wake_up_if_idle() Make sure to prod idle CPUs so they call klp_update_patch_state(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Vasily Gorbik Tested-by: Petr Mladek Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929151723.162004989@infradead.org --- include/linux/sched/idle.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 22873d276be6..d73d314d59c6 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,7 +11,11 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; +#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); +#else +static inline void wake_up_if_idle(int cpu) { } +#endif /* * Idle thread specific functions to determine the need_resched -- cgit v1.2.3 From 4ef0c5c6b5ba1f38f0ea1cedad0cad722f00c14a Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 15 Sep 2021 14:40:30 +0800 Subject: kernel/sched: Fix sched_fork() access an invalid sched_task_group There is a small race between copy_process() and sched_fork() where child->sched_task_group points to an already freed pointer.

  parent doing fork()      | someone moving the parent
                           | to another cgroup
  -------------------------+-------------------------------
  copy_process()
    + dup_task_struct() <1>
                             parent move to another cgroup,
                             and free the old cgroup.
                             <2>
    + sched_fork()
      + __set_task_cpu() <3>
      + task_fork_fair()
        + sched_slice() <4>

In the worst case, this bug can lead to "use-after-free" and cause a panic, as shown above:
(1) the parent copies its sched_task_group to the child at <1>;
(2) someone moves the parent to another cgroup and frees the old cgroup at <2>;
(3) the sched_task_group and cfs_rq that belong to the old cgroup are accessed at <3> and <4>, which causes a panic:

[] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[] PGD 8000001fa0a86067 P4D 8000001fa0a86067 PUD 2029955067 PMD 0
[] Oops: 0000 [#1] SMP PTI
[] CPU: 7 PID: 648398 Comm: ebizzy Kdump: loaded Tainted: G OE --------- - - 4.18.0.x86_64+ #1
[] RIP: 0010:sched_slice+0x84/0xc0
[] Call Trace:
[] task_fork_fair+0x81/0x120
[] sched_fork+0x132/0x240
[] copy_process.part.5+0x675/0x20e0
[] ? __handle_mm_fault+0x63f/0x690
[] _do_fork+0xcd/0x3b0
[] do_syscall_64+0x5d/0x1d0
[] entry_SYSCALL_64_after_hwframe+0x65/0xca
[] RIP: 0033:0x7f04418cd7e1

Between cgroup_can_fork() and cgroup_post_fork(), the cgroup membership and thus sched_task_group can't change. So update the child's sched_task_group at sched_post_fork() and move task_fork() and __set_task_cpu() (where the sched_task_group is accessed) from sched_fork() to sched_post_fork(). Fixes: 8323f26ce342 ("sched: Fix race in task_group") Signed-off-by: Zhang Qiao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lkml.kernel.org/r/20210915064030.2231-1-zhangqiao22@huawei.com --- include/linux/sched/task.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ef02be869cf2..ba88a6987400 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,7 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p); +extern void sched_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); -- cgit v1.2.3 From 804bccba71a57e7e5deb507a4c8ebbab730909c0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Sep 2021 19:54:50 -0700 Subject: sched: Fill unconditional hole induced by sched_entity With struct sched_entity before the other sched entities, its alignment won't induce a struct hole. This saves 64 bytes in defconfig task_struct: Before: ... unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ const struct sched_class * sched_class; /* 128 8 */ /* XXX 56 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 192 448 */ /* --- cacheline 10 boundary (640 bytes) --- */ struct sched_rt_entity rt; /* 640 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 688 224 */ /* --- cacheline 14 boundary (896 bytes) was 16 bytes ago --- */ After: ...
unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 128 448 */ /* --- cacheline 9 boundary (576 bytes) --- */ struct sched_rt_entity rt; /* 576 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 624 224 */ /* --- cacheline 13 boundary (832 bytes) was 16 bytes ago --- */ Summary diff: - /* size: 7040, cachelines: 110, members: 188 */ + /* size: 6976, cachelines: 109, members: 188 */ Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210924025450.4138503-1-keescook@chromium.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 193e16e2d0e4..343603f77f8b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,10 +775,10 @@ struct task_struct { int normal_prio; unsigned int rt_priority; - const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; -- cgit v1.2.3 From e9bdcdbf6936dd1fbf419e00222dc9038b7812c2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 11 Oct 2021 15:32:44 +0200 Subject: pid: add pidfd_get_task() helper The number of system calls making use of pidfds is constantly increasing. Some of those new system calls duplicate the code to turn a pidfd into task_struct it refers to. Give them a simple helper for this. Link: https://lore.kernel.org/r/20211004125050.1153693-2-christian.brauner@ubuntu.com Link: https://lore.kernel.org/r/20211011133245.1703103-2-brauner@kernel.org Cc: Vlastimil Babka Cc: Suren Baghdasaryan Cc: Matthew Bobrowski Cc: Alexander Duyck Cc: David Hildenbrand Cc: Jan Kara Cc: Minchan Kim Reviewed-by: Matthew Bobrowski Acked-by: David Hildenbrand Signed-off-by: Christian Brauner --- include/linux/pid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index af308e15f174..343abf22092e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -78,6 +78,7 @@ struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); +struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_create(struct pid *pid, unsigned int flags); static inline struct pid *get_pid(struct pid *pid) -- cgit v1.2.3 From 6098475d4cb48d821bdf453c61118c56e26294f0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 8 Oct 2021 14:31:57 +0100 Subject: spi: Fix deadlock when adding SPI controllers on SPI buses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we have a global spi_add_lock which we take when adding new devices so that we can check that we're not trying to reuse a chip select that's already controlled. This means that if the SPI device is itself a SPI controller and triggers the instantiation of further SPI devices we trigger a deadlock as we try to register and instantiate those devices while in the process of doing so for the parent controller and hence already holding the global spi_add_lock. Since we only care about concurrency within a single SPI bus move the lock to be per controller, avoiding the deadlock. This can be easily triggered in the case of spi-mux. 
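A sketch of how such a per-controller lock is then used when adding a device (simplified; the real spi_add_device() also does chip-select validation):

	static int spi_add_device(struct spi_device *spi)
	{
		struct spi_controller *ctlr = spi->controller;
		int status;

		/* Chip selects are numbered per controller, so it is
		 * sufficient to serialize against this bus only.
		 */
		mutex_lock(&ctlr->add_lock);
		status = __spi_add_device(spi);
		mutex_unlock(&ctlr->add_lock);
		return status;
	}

A child controller instantiated from within __spi_add_device() (the spi-mux case) now takes its own add_lock instead of recursing on a lock already held for the parent bus, which is what previously deadlocked.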
Reported-by: Uwe Kleine-König Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8371bca13729..6b0b686f6f90 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -531,6 +531,9 @@ struct spi_controller { /* I/O mutex */ struct mutex io_mutex; + /* Used to avoid adding the same CS twice */ + struct mutex add_lock; + /* lock and mutex for SPI bus locking */ spinlock_t bus_lock_spinlock; struct mutex bus_lock_mutex; -- cgit v1.2.3 From 9f37ab0412eba537377c38b1dde1a04fbd7b5264 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 14 Oct 2021 14:18:59 +0000 Subject: PCI/switchtec: Add check of event support Not all events are supported by every gen/variant of the Switchtec firmware. To solve this, since Gen4, a new bit in each event header is introduced to indicate if an event is supported by the firmware. Link: https://lore.kernel.org/r/20211014141859.11444-6-kelvin.cao@microchip.com Signed-off-by: Logan Gunthorpe Signed-off-by: Kelvin Cao Signed-off-by: Bjorn Helgaas --- include/linux/switchtec.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h index 082f1d51957a..be24056ac00f 100644 --- a/include/linux/switchtec.h +++ b/include/linux/switchtec.h @@ -19,6 +19,7 @@ #define SWITCHTEC_EVENT_EN_CLI BIT(2) #define SWITCHTEC_EVENT_EN_IRQ BIT(3) #define SWITCHTEC_EVENT_FATAL BIT(4) +#define SWITCHTEC_EVENT_NOT_SUPP BIT(31) #define SWITCHTEC_DMA_MRPC_EN BIT(0) -- cgit v1.2.3 From 766607570becbd26cab6d66a544dd8d0d964df5a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Oct 2021 07:24:31 -0700 Subject: ethernet: constify references to netdev->dev_addr in drivers This big patch sprinkles const on local variables and function arguments which may refer to netdev->dev_addr. Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. To maintain netdev->dev_addr in this tree we need to make all the writes to it got through appropriate helpers. Some of the changes here are not strictly required - const is sometimes cast off but pointer is not used for writing. It seems like it's still better to add the const in case the code changes later or relevant -W flags get enabled for the build. No functional changes. 
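The resulting pattern in a converted driver looks roughly like this (foo_hw_write_mac() is a made-up driver callback, shown only for illustration):

	static void foo_sync_mac(struct net_device *dev)
	{
		const u8 *addr = dev->dev_addr;	/* read-only view */

		foo_hw_write_mac(dev, addr);	/* never writes through addr */
	}

All writes instead go through helpers such as dev_addr_set() or dev_addr_mod(), which keeps the rbtree that tracks dev_addr consistent.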
Link: https://lore.kernel.org/r/20211014142432.449314-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/qed/qed_eth_if.h | 2 +- include/linux/qed/qed_if.h | 2 +- include/linux/qed/qed_rdma_if.h | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 4df0bf0a0864..e1bf3219b4e6 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -331,7 +331,7 @@ struct qed_eth_ops { int (*configure_arfs_searcher)(struct qed_dev *cdev, enum qed_filter_config_mode mode); int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); - int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac); + int (*req_bulletin_update_mac)(struct qed_dev *cdev, const u8 *mac); }; const struct qed_eth_ops *qed_get_eth_ops(void); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index ad220d5da18f..0dae7fcc5ef2 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1113,7 +1113,7 @@ struct qed_common_ops { * * Return: Int. */ - int (*update_mac)(struct qed_dev *cdev, u8 *mac); + int (*update_mac)(struct qed_dev *cdev, const u8 *mac); /** * update_mtu(): API to inform the change in the mtu. diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index aeb242cefebf..3b76c07fbcf8 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -662,7 +662,8 @@ struct qed_rdma_ops { u8 connection_handle, struct qed_ll2_stats *p_stats); int (*ll2_set_mac_filter)(struct qed_dev *cdev, - u8 *old_mac_address, u8 *new_mac_address); + u8 *old_mac_address, + const u8 *new_mac_address); int (*iwarp_set_engine_affin)(struct qed_dev *cdev, bool b_reset); -- cgit v1.2.3 From 54f2d8d6ca99a3e24efc9c8b055bbcbe0b583227 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 Oct 2021 13:44:30 -0700 Subject: ethernet: make eth_hw_addr_random() use dev_addr_set() Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. To maintain netdev->dev_addr in this tree we need to make all the writes to it got through appropriate helpers. Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 3cf546d2ffd1..76f7ff684cbf 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -269,8 +269,11 @@ static inline void eth_zero_addr(u8 *addr) */ static inline void eth_hw_addr_random(struct net_device *dev) { + u8 addr[ETH_ALEN]; + + eth_random_addr(addr); + __dev_addr_set(dev, addr, ETH_ALEN); dev->addr_assign_type = NET_ADDR_RANDOM; - eth_random_addr(dev->dev_addr); } /** -- cgit v1.2.3 From ba530fea8ca1b57ee71d4e62f287a5d7ed92f789 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 Oct 2021 13:54:50 -0700 Subject: ethernet: remove random_ether_addr() random_ether_addr() was the original name of the helper which was kept for backward compatibility (?) after the rename in commit 0a4dd594982a ("etherdevice: Rename random_ether_addr to eth_random_addr"). We have a single random_ether_addr() caller left in tree while there are 70 callers of eth_random_addr(). Time to drop this define. 
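Converting the remaining user is mechanical; depending on whether NET_ADDR_RANDOM should also be recorded, it is one of (sketch):

	-	random_ether_addr(dev->dev_addr);
	+	eth_hw_addr_random(dev);

or, when only a local buffer needs filling:

	u8 addr[ETH_ALEN];

	eth_random_addr(addr);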
Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20211013205450.328092-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 76f7ff684cbf..23681c3d3b8a 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -234,8 +234,6 @@ static inline void eth_random_addr(u8 *addr) addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } -#define random_ether_addr(addr) eth_random_addr(addr) - /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From 7463acfbe52ae8b7e0ea6890c1886b3f8ba8bddd Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:01 +0200 Subject: netfilter: Rename ingress hook include file Prepare for addition of a netfilter egress hook by renaming <linux/netfilter_ingress.h> to <linux/netfilter_netdev.h>. The egress hook also necessitates a refactoring of the include file, but that is done in a separate commit to ease reviewing. No functional change intended. Signed-off-by: Lukas Wunner Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ingress.h | 58 --------------------------------------- include/linux/netfilter_netdev.h | 58 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 58 deletions(-) delete mode 100644 include/linux/netfilter_ingress.h create mode 100644 include/linux/netfilter_netdev.h (limited to 'include/linux') diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h deleted file mode 100644 index a13774be2eb5..000000000000 --- a/include/linux/netfilter_ingress.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NETFILTER_INGRESS_H_ -#define _NETFILTER_INGRESS_H_ - -#include -#include - -#ifdef CONFIG_NETFILTER_INGRESS -static inline bool nf_hook_ingress_active(const struct sk_buff *skb) -{ -#ifdef CONFIG_JUMP_LABEL - if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) - return false; -#endif - return rcu_access_pointer(skb->dev->nf_hooks_ingress); -} - -/* caller must hold rcu_read_lock */ -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); - struct nf_hook_state state; - int ret; - - /* Must recheck the ingress hook head, in the event it became NULL - * after the check in nf_hook_ingress_active evaluated to true.
- */ - if (unlikely(!e)) - return 0; - - nf_hook_state_init(&state, NF_NETDEV_INGRESS, - NFPROTO_NETDEV, skb->dev, NULL, NULL, - dev_net(skb->dev), NULL); - ret = nf_hook_slow(skb, &state, e, 0); - if (ret == 0) - return -1; - - return ret; -} - -static inline void nf_hook_ingress_init(struct net_device *dev) -{ - RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); -} -#else /* CONFIG_NETFILTER_INGRESS */ -static inline int nf_hook_ingress_active(struct sk_buff *skb) -{ - return 0; -} - -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - return 0; -} - -static inline void nf_hook_ingress_init(struct net_device *dev) {} -#endif /* CONFIG_NETFILTER_INGRESS */ -#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h new file mode 100644 index 000000000000..a13774be2eb5 --- /dev/null +++ b/include/linux/netfilter_netdev.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline bool nf_hook_ingress_active(const struct sk_buff *skb) +{ +#ifdef CONFIG_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) + return false; +#endif + return rcu_access_pointer(skb->dev->nf_hooks_ingress); +} + +/* caller must hold rcu_read_lock */ +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_state state; + int ret; + + /* Must recheck the ingress hook head, in the event it became NULL + * after the check in nf_hook_ingress_active evaluated to true. + */ + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, NF_NETDEV_INGRESS, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); + ret = nf_hook_slow(skb, &state, e, 0); + if (ret == 0) + return -1; + + return ret; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ -- cgit v1.2.3 From 17d20784223d52bf1671f984c9e8d5d9b8ea171b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:02 +0200 Subject: netfilter: Generalize ingress hook include file Prepare for addition of a netfilter egress hook by generalizing the ingress hook include file. No functional change intended. 
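For context, the receive path consumes these helpers roughly as follows (simplified from the nf_ingress() call site in net/core/dev.c):

	if (nf_hook_ingress_active(skb)) {
		int ret;

		rcu_read_lock();
		ret = nf_hook_ingress(skb);
		rcu_read_unlock();
		if (ret < 0)
			return ret;	/* skb consumed or dropped */
	}

The generalized nf_hook_netdev_init() below gives the upcoming egress hook a single place to initialize its per-device hook pointer alongside the ingress one.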
Signed-off-by: Lukas Wunner Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index a13774be2eb5..5812b0fb0278 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NETFILTER_INGRESS_H_ -#define _NETFILTER_INGRESS_H_ +#ifndef _NETFILTER_NETDEV_H_ +#define _NETFILTER_NETDEV_H_ #include #include @@ -38,10 +38,6 @@ static inline int nf_hook_ingress(struct sk_buff *skb) return ret; } -static inline void nf_hook_ingress_init(struct net_device *dev) -{ - RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); -} #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { @@ -52,7 +48,13 @@ static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } - -static inline void nf_hook_ingress_init(struct net_device *dev) {} #endif /* CONFIG_NETFILTER_INGRESS */ -#endif /* _NETFILTER_INGRESS_H_ */ + +static inline void nf_hook_netdev_init(struct net_device *dev) +{ +#ifdef CONFIG_NETFILTER_INGRESS + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); +#endif +} + +#endif /* _NETFILTER_NETDEV_H_ */ -- cgit v1.2.3 From 42df6e1d221dddc0f2acf2be37e68d553ad65f96 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:03 +0200 Subject: netfilter: Introduce egress hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support classifying packets with netfilter on egress to satisfy user requirements such as:

* outbound security policies for containers (Laura)
* filtering and mangling intra-node Direct Server Return (DSR) traffic on a load balancer (Laura)
* filtering locally generated traffic coming in through AF_PACKET, such as local ARP traffic generated for clustering purposes or DHCP (Laura; the AF_PACKET plumbing is contained in a follow-up commit)
* L2 filtering from ingress and egress for AVB (Audio Video Bridging) and gPTP with nftables (Pablo)
* in the future: in-kernel NAT64/NAT46 (Pablo)

The egress hook introduced herein complements the ingress hook added by commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key"). A patch for nftables to hook up egress rules from user space has been submitted separately, so users may immediately take advantage of the feature. Alternatively or in addition to netfilter, packets can be classified with traffic control (tc). On ingress, packets are classified first by tc, then by netfilter. On egress, the order is reversed for symmetry. Conceptually, tc and netfilter can be thought of as layers, with netfilter layered above tc. Traffic control is capable of redirecting packets to another interface (man 8 tc-mirred). E.g., an ingress packet may be redirected from the host namespace to a container via a veth connection:

  tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container)

In this case, netfilter egress classifying is not performed when leaving the host namespace! That's because the packet is still on the tc layer. If tc redirects the packet to a physical interface in the host namespace such that it leaves the system, the packet is never subjected to netfilter egress classifying. That is only logical since it hasn't passed through netfilter ingress classifying either.
Packets can alternatively be redirected at the netfilter layer using nft fwd. Such a packet *is* subjected to netfilter egress classifying since it has reached the netfilter layer. Internally, the skb->nf_skip_egress flag controls whether netfilter is invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may be called recursively by tunnel drivers such as vxlan, the flag is reverted to false after sch_handle_egress(). This ensures that netfilter is applied both on the overlay and underlying network. Interaction between tc and netfilter is possible by setting and querying skb->mark. If netfilter egress classifying is not enabled on any interface, it is patched out of the data path by way of a static_key and doesn't make a performance difference that is discernible from noise: Before: 1537 1538 1538 1537 1538 1537 Mb/sec After: 1536 1534 1539 1539 1539 1540 Mb/sec Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec When netfilter egress classifying is enabled on at least one interface, a minimal performance penalty is incurred for every egress packet, even if the interface it's transmitted over doesn't have any netfilter egress rules configured. That is caused by checking dev->nf_hooks_egress against NULL. Measurements were performed on a Core i7-3615QM. Commands to reproduce: ip link add dev foo type dummy ip link set dev foo up modprobe pktgen echo "add_device foo" > /proc/net/pktgen/kpktgend_3 samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1 Accept all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,' Drop all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,' Apply this patch when measuring packet drops to avoid errors in dmesg: https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/ Signed-off-by: Lukas Wunner Cc: Laura GarcĂ­a LiĂ©bana Cc: John Fastabend Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 4 ++ include/linux/netfilter_netdev.h | 86 ++++++++++++++++++++++++++++++++++++++++ include/linux/skbuff.h | 4 ++ 3 files changed, 94 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79163208dfd..e9a48068f306 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1861,6 +1861,7 @@ enum netdev_ml_priv_type { * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing + * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) @@ -2161,6 +2162,9 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 5812b0fb0278..b71b57a83bb4 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -50,11 +50,97 @@ static inline int nf_hook_ingress(struct sk_buff *skb) } #endif /* 
CONFIG_NETFILTER_INGRESS */ +#ifdef CONFIG_NETFILTER_EGRESS +static inline bool nf_hook_egress_active(void) +{ +#ifdef CONFIG_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) + return false; +#endif + return true; +} + +/** + * nf_hook_egress - classify packets before transmission + * @skb: packet to be classified + * @rc: result code which shall be returned by __dev_queue_xmit() on failure + * @dev: netdev whose egress hooks shall be applied to @skb + * + * Returns @skb on success or %NULL if the packet was consumed or filtered. + * Caller must hold rcu_read_lock. + * + * On ingress, packets are classified first by tc, then by netfilter. + * On egress, the order is reversed for symmetry. Conceptually, tc and + * netfilter can be thought of as layers, with netfilter layered above tc: + * When tc redirects a packet to another interface, netfilter is not applied + * because the packet is on the tc layer. + * + * The nf_skip_egress flag controls whether netfilter is applied on egress. + * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the + * packet passes through tc and netfilter. Because __dev_queue_xmit() may be + * called recursively by tunnel drivers such as vxlan, the flag is reverted to + * false after sch_handle_egress(). This ensures that netfilter is applied + * both on the overlay and underlying network. + */ +static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, + struct net_device *dev) +{ + struct nf_hook_entries *e; + struct nf_hook_state state; + int ret; + +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + if (skb->nf_skip_egress) + return skb; +#endif + + e = rcu_dereference(dev->nf_hooks_egress); + if (!e) + return skb; + + nf_hook_state_init(&state, NF_NETDEV_EGRESS, + NFPROTO_NETDEV, dev, NULL, NULL, + dev_net(dev), NULL); + ret = nf_hook_slow(skb, &state, e, 0); + + if (ret == 1) { + return skb; + } else if (ret < 0) { + *rc = NET_XMIT_DROP; + return NULL; + } else { /* ret == 0 */ + *rc = NET_XMIT_SUCCESS; + return NULL; + } +} +#else /* CONFIG_NETFILTER_EGRESS */ +static inline bool nf_hook_egress_active(void) +{ + return false; +} + +static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, + struct net_device *dev) +{ + return skb; +} +#endif /* CONFIG_NETFILTER_EGRESS */ + +static inline void nf_skip_egress(struct sk_buff *skb, bool skip) +{ +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + skb->nf_skip_egress = skip; +#endif +} + static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif +#ifdef CONFIG_NETFILTER_EGRESS + RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); +#endif } #endif /* _NETFILTER_NETDEV_H_ */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 841e2f0f5240..cb96f1e6460c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -652,6 +652,7 @@ typedef unsigned char *sk_buff_data_t; * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path + * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag @@ -868,6 +869,9 @@ struct sk_buff { #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif +#ifdef CONFIG_NETFILTER_SKIP_EGRESS + __u8 nf_skip_egress:1; +#endif #ifdef 
CONFIG_TLS_DEVICE __u8 decrypted:1; #endif -- cgit v1.2.3 From 8844e01062ddd8196c4550df9803cc1835d123c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Oct 2021 17:15:10 +0200 Subject: netfilter: iptables: allow use of ipt_do_table as hookfn This is possible now that the xt_table structure is passed in via *priv. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4/ip_tables.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 8d09bfe850dc..132b0e4a6d4d 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -63,9 +63,9 @@ struct ipt_error { } extern void *ipt_alloc_initial_table(const struct xt_table *); -extern unsigned int ipt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table); +extern unsigned int ipt_do_table(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include -- cgit v1.2.3 From e8d225b6002673366abc2e40e30c991bdc8d62ca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Oct 2021 17:15:12 +0200 Subject: netfilter: arp_tables: allow use of arpt_do_table as hookfn This is possible now that the xt_table structure is passed in via *priv. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 4f9a4b3c5892..a40aaf645fa4 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -54,9 +54,8 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); void arpt_unregister_table_pre_exit(struct net *net, const char *name); -extern unsigned int arpt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table); +extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include -- cgit v1.2.3 From 44b5990e7b463240e4c116c9e8670c67dad960cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Oct 2021 17:15:13 +0200 Subject: netfilter: ip6tables: allow use of ip6t_do_table as hookfn This is possible now that the xt_table structure is passed via *priv. 
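With the signature now matching nf_hookfn, per-table wrapper functions can go away and registration can point straight at the helper. A sketch of what a single hook registration then looks like (field values illustrative):

	static const struct nf_hook_ops filter_ops = {
		.hook		= ip6t_do_table,	/* no per-table wrapper */
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_FILTER,
	};

	/* ops->priv is set to the struct xt_table when the table is
	 * registered, so ip6t_do_table() recovers the table from its
	 * void *priv argument.
	 */

In-tree tables go through xt_hook_ops_alloc(table, ip6t_do_table), which fills in one such ops entry per hooked chain.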
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6/ip6_tables.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 79e73fd7d965..8b8885a73c76 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -29,9 +29,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, const struct nf_hook_ops *ops); void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); -extern unsigned int ip6t_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table); +extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include -- cgit v1.2.3 From f0d6764f7ddbf6d57302155b0f83eadb25ab0f0c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Oct 2021 17:15:14 +0200 Subject: netfilter: ebtables: allow use of ebt_do_table as hookfn This is possible now that the xt_table structure is passed via *priv. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 10a01978bc0d..a13296d6c7ce 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -112,9 +112,8 @@ extern int ebt_register_table(struct net *net, const struct nf_hook_ops *ops); extern void ebt_unregister_table(struct net *net, const char *tablename); void ebt_unregister_table_pre_exit(struct net *net, const char *tablename); -extern unsigned int ebt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct ebt_table *table); +extern unsigned int ebt_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); /* True if the hook mask denotes that the rule is in a base chain, * used in the check() functions */ -- cgit v1.2.3 From 8b017fbe0bbb98dd71fb4850f6b9cc0e136a26b8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Oct 2021 11:00:37 +0200 Subject: net: of: fix stub of_net helpers for CONFIG_NET=n Moving the of_net code from drivers/of/ to net/core means we no longer stub out the helpers when networking is disabled, which leads to a randconfig build failure with at least one ARM platform that calls this from non-networking code: arm-linux-gnueabi-ld: arch/arm/mach-mvebu/kirkwood.o: in function `kirkwood_dt_eth_fixup': kirkwood.c:(.init.text+0x54): undefined reference to `of_get_mac_address' Restore the way this worked before by changing that #ifdef check back to testing for both CONFIG_OF and CONFIG_NET. 
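The resulting shape of the header (abridged; only one accessor shown):

	#if defined(CONFIG_OF) && defined(CONFIG_NET)
	int of_get_mac_address(struct device_node *np, u8 *addr);
	#else
	static inline int of_get_mac_address(struct device_node *np, u8 *addr)
	{
		return -ENODEV;
	}
	#endif

Callers outside the networking code, like the kirkwood fixup above, then compile against the inline stub when CONFIG_NET=n instead of failing at link time.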
Fixes: e330fb14590c ("of: net: move of_net under net/") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211014090055.2058949-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/of_net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0797e2edb8c2..0484b613ca64 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_OF +#if defined(CONFIG_OF) && defined(CONFIG_NET) #include struct net_device; -- cgit v1.2.3 From 52f88693378a58094c538662ba652aff0253c4fe Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 12 Oct 2021 09:56:13 -0700 Subject: binder: use cred instead of task for selinux checks Since binder was integrated with selinux, it has passed 'struct task_struct' associated with the binder_proc to represent the source and target of transactions. The conversion of task to SID was then done in the hook implementations. It turns out that there are race conditions which can result in an incorrect security context being used. Fix by using the 'struct cred' saved during binder_open and pass it to the selinux subsystem. Cc: stable@vger.kernel.org # 5.14 (need backport for earlier stables) Fixes: 79af73079d75 ("Add security hooks to binder and implement the hooks for SELinux.") Suggested-by: Jann Horn Signed-off-by: Todd Kjos Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 14 +++++++------- include/linux/lsm_hooks.h | 14 +++++++------- include/linux/security.h | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index b3c525353769..4c7ed0268ce3 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -26,13 +26,13 @@ * #undef LSM_HOOK * }; */ -LSM_HOOK(int, 0, binder_set_context_mgr, struct task_struct *mgr) -LSM_HOOK(int, 0, binder_transaction, struct task_struct *from, - struct task_struct *to) -LSM_HOOK(int, 0, binder_transfer_binder, struct task_struct *from, - struct task_struct *to) -LSM_HOOK(int, 0, binder_transfer_file, struct task_struct *from, - struct task_struct *to, struct file *file) +LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr) +LSM_HOOK(int, 0, binder_transaction, const struct cred *from, + const struct cred *to) +LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from, + const struct cred *to) +LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from, + const struct cred *to, struct file *file) LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, unsigned int mode) LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0eb0ae95c4c4..528554e9b90c 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1313,22 +1313,22 @@ * * @binder_set_context_mgr: * Check whether @mgr is allowed to be the binder context manager. - * @mgr contains the task_struct for the task being registered. + * @mgr contains the struct cred for the current binder process. * Return 0 if permission is granted. * @binder_transaction: * Check whether @from is allowed to invoke a binder transaction call * to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. 
+ * @to contains the struct cred for the receiving process. * @binder_transfer_binder: * Check whether @from is allowed to transfer a binder reference to @to. - * @from contains the task_struct for the sending task. - * @to contains the task_struct for the receiving task. + * @from contains the struct cred for the sending process. + * @to contains the struct cred for the receiving process. * @binder_transfer_file: * Check whether @from is allowed to transfer @file to @to. - * @from contains the task_struct for the sending task. + * @from contains the struct cred for the sending process. * @file contains the struct file being transferred. - * @to contains the task_struct for the receiving task. + * @to contains the struct cred for the receiving process. * * @ptrace_access_check: * Check permission before allowing the current process to trace the diff --git a/include/linux/security.h b/include/linux/security.h index 7979b9629a42..9be72166e859 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -258,13 +258,13 @@ extern int security_init(void); extern int early_security_init(void); /* Security operations */ -int security_binder_set_context_mgr(struct task_struct *mgr); -int security_binder_transaction(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to); -int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, struct file *file); +int security_binder_set_context_mgr(const struct cred *mgr); +int security_binder_transaction(const struct cred *from, + const struct cred *to); +int security_binder_transfer_binder(const struct cred *from, + const struct cred *to); +int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file); int security_ptrace_access_check(struct task_struct *child, unsigned int mode); int security_ptrace_traceme(struct task_struct *parent); int security_capget(struct task_struct *target, @@ -508,25 +508,25 @@ static inline int early_security_init(void) return 0; } -static inline int security_binder_set_context_mgr(struct task_struct *mgr) +static inline int security_binder_set_context_mgr(const struct cred *mgr) { return 0; } -static inline int security_binder_transaction(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transaction(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_binder(struct task_struct *from, - struct task_struct *to) +static inline int security_binder_transfer_binder(const struct cred *from, + const struct cred *to) { return 0; } -static inline int security_binder_transfer_file(struct task_struct *from, - struct task_struct *to, +static inline int security_binder_transfer_file(const struct cred *from, + const struct cred *to, struct file *file) { return 0; -- cgit v1.2.3 From 4d5b5539742d2554591751b4248b0204d20dcc9d Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Tue, 12 Oct 2021 09:56:14 -0700 Subject: binder: use cred instead of task for getsecid Use the 'struct cred' saved at binder_open() to lookup the security ID via security_cred_getsecid(). This ensures that the security context that opened binder is the one used to generate the secctx. 
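As a sketch of the resulting call pattern (hypothetical helper name; the real binder code differs in structure and error handling), the secctx is now derived from the saved cred rather than from a task:

    static int binder_secctx_sketch(const struct cred *binder_cred,
                                    char **secctx, u32 *secctx_sz)
    {
        u32 secid;

        /* use the cred saved at binder_open(), not the sender's task */
        security_cred_getsecid(binder_cred, &secid);
        return security_secid_to_secctx(secid, secctx, secctx_sz);
    }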
Cc: stable@vger.kernel.org # 5.4+ Fixes: ec74136ded79 ("binder: create node flag to request sender's security context") Signed-off-by: Todd Kjos Suggested-by: Stephen Smalley Reported-by: kernel test robot Acked-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/security.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 9be72166e859..cc6d39358336 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1041,6 +1041,11 @@ static inline void security_transfer_creds(struct cred *new, { } +static inline void security_cred_getsecid(const struct cred *c, u32 *secid) +{ + *secid = 0; +} + static inline int security_kernel_act_as(struct cred *cred, u32 secid) { return 0; -- cgit v1.2.3 From c072c4ef7ef09e1d6470c48cf52570487589b76a Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Thu, 14 Oct 2021 16:35:06 +0300 Subject: soc: samsung: exynos-chipid: Pass revision reg offsets Old Exynos SoCs have both the Product ID and the Revision ID in a single register, while new SoCs tend to have two separate registers for those IDs. Implement handling of both cases by passing the Revision ID register offsets in driver data. The previously existing macros for Exynos4210 (removed in this patch) were incorrect:

    #define EXYNOS_SUBREV_MASK   (0xf << 4)
    #define EXYNOS_MAINREV_MASK  (0xf << 0)

The actual format of the PRO_ID register in Exynos4210 (offset 0x0) is:

    [31:12] Product ID
    [9:8]   Package information
    [7:4]   Main Revision Number
    [3:0]   Sub Revision Number

This patch doesn't change the behavior on existing platforms, so '/sys/devices/soc0/revision' will show the same string as before. Signed-off-by: Sam Protsenko Tested-by: Henrik Grimler Link: https://lore.kernel.org/r/20211014133508.1210-1-semen.protsenko@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/exynos-chipid.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/exynos-chipid.h b/include/linux/soc/samsung/exynos-chipid.h index 8bca6763f99c..62f0e2531068 100644 --- a/include/linux/soc/samsung/exynos-chipid.h +++ b/include/linux/soc/samsung/exynos-chipid.h @@ -9,10 +9,8 @@ #define __LINUX_SOC_EXYNOS_CHIPID_H #define EXYNOS_CHIPID_REG_PRO_ID 0x00 -#define EXYNOS_SUBREV_MASK (0xf << 4) -#define EXYNOS_MAINREV_MASK (0xf << 0) -#define EXYNOS_REV_MASK (EXYNOS_SUBREV_MASK | \ - EXYNOS_MAINREV_MASK) +#define EXYNOS_REV_PART_MASK 0xf +#define EXYNOS_REV_PART_SHIFT 4 #define EXYNOS_MASK 0xfffff000 #define EXYNOS_CHIPID_REG_PKG_ID 0x04 -- cgit v1.2.3 From 42a20f86dc19f9282d974df0ba4d226c865ab9dd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:14 -0700 Subject: sched: Add wrapper for get_wchan() to keep task blocked Having a stable wchan means the process must be blocked, and it must stay blocked while the stack unwinding is performed.
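The wrapper itself lands in kernel/sched/core.c (outside this include/linux hunk); it looks roughly like the following sketch, with __get_wchan() being the per-architecture unwinder this series introduces:

    unsigned long get_wchan(struct task_struct *p)
    {
        unsigned long ip = 0;
        unsigned int state;

        if (!p || p == current)
            return 0;

        /* Only get wchan if task is blocked and we can keep it that way. */
        raw_spin_lock_irq(&p->pi_lock);
        state = READ_ONCE(p->__state);
        smp_rmb(); /* see try_to_wake_up() */
        if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
            ip = __get_wchan(p);
        raw_spin_unlock_irq(&p->pi_lock);

        return ip;
    }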
Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Acked-by: Russell King (Oracle) [arm] Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/20211008111626.332092234@infradead.org --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 343603f77f8b..5f8db54226af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); +extern unsigned long get_wchan(struct task_struct *p); /* * In order to reduce various lock holder preemption latencies provide an -- cgit v1.2.3 From c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 24 Sep 2021 20:51:02 +1200 Subject: topology: Represent clusters of CPUs within a die Both ACPI and DT provide the ability to describe additional layers of topology between that of individual cores and higher level constructs such as the level at which the last level cache is shared. In ACPI this can be represented in PPTT as a Processor Hierarchy Node Structure [1] that is the parent of the CPU cores and in turn has a parent Processor Hierarchy Node Structure representing a higher level of topology. For example, Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each cluster has 4 CPUs. All clusters share the L3 cache data, but each cluster has a local L3 tag. On the other hand, the clusters share some internal system bus.

[flattened ASCII-art topology diagram omitted: six clusters of 4 CPUs each, every cluster with its own local L3 tag, all connected through an internal bus to the shared L3 data]

That means spreading tasks among clusters will bring more bandwidth, while packing tasks within one cluster will lead to smaller cache synchronization latency. So both kernel and userspace will have a chance to leverage this topology to deploy tasks accordingly, achieving either smaller cache latency within one cluster or an even distribution of load among clusters for higher throughput. This patch exposes the cluster topology to both kernel and userspace. Libraries like hwloc will know clusters via cluster_cpus and the related sysfs attributes. A PoC of HWLOC support is at [2]. Note this patch only handles the ACPI case. Special consideration is needed for SMT processors, where it is necessary to move 2 levels up the hierarchy from the leaf nodes (thus skipping the processor core level). Note that arm64 / ACPI does not provide any means of identifying a die level in the topology, but that may be unrelated to the cluster level. [1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node structure (Type 0) [2] https://github.com/hisilicon/hwloc/tree/linux-cluster Signed-off-by: Jonathan Cameron Signed-off-by: Tian Tao Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com --- include/linux/acpi.h | 5 +++++ include/linux/arch_topology.h | 5 +++++ include/linux/topology.h | 6 ++++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..fbc2146050a4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1353,6 +1353,7 @@ static inline int lpit_read_residency_count_address(u64 *address) #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); +int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); int find_acpi_cpu_cache_topology(unsigned int cpu, int level); @@ -1365,6 +1366,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } +static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) +{ + return -EINVAL; +} static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index f180240dc95f..b97cea83b25e 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -62,10 +62,12 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, struct cpu_topology { int thread_id; int core_id; + int cluster_id; int package_id; int llc_id; cpumask_t thread_sibling; cpumask_t core_sibling; + cpumask_t cluster_sibling; cpumask_t llc_sibling; }; @@ -73,13 +75,16 @@ struct cpu_topology { extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) +#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) #define topology_core_id(cpu) (cpu_topology[cpu].core_id) #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) +#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +const struct cpumask *cpu_clustergroup_mask(int cpu);
void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); diff --git a/include/linux/topology.h b/include/linux/topology.h index 7634cd737061..80d27d717631 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -186,6 +186,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif +#ifndef topology_cluster_id +#define topology_cluster_id(cpu) ((void)(cpu), -1) +#endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif @@ -195,6 +198,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifndef topology_cluster_cpumask +#define topology_cluster_cpumask(cpu) cpumask_of(cpu) +#endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif -- cgit v1.2.3 From 778c558f49a2cb3dc7b18a80ff515e82aa813627 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 Sep 2021 20:51:03 +1200 Subject: sched: Add cluster scheduler level in core and related Kconfig for ARM64 This patch adds a scheduler level for clusters and automatically enables load balancing among clusters. It will directly benefit many workloads that want more resources, such as memory bandwidth and caches. Testing has been done widely on two different hardware configurations of Kunpeng 920: 24 cores in one NUMA node (6 clusters in each NUMA node); 32 cores in one NUMA node (8 clusters in each NUMA node). Workloads are run on either one NUMA node or four NUMA nodes; this estimates the effect of cluster spreading w/ and w/o NUMA load balance.

* Stream benchmark:

4threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     29929.64 (   0.00%)    32932.68 (  10.03%)
MB/sec scale    29861.10 (   0.00%)    32710.58 (   9.54%)
MB/sec add      27034.42 (   0.00%)    32400.68 (  19.85%)
MB/sec triad    27225.26 (   0.00%)    31965.36 (  17.41%)

6threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     40330.24 (   0.00%)    42377.68 (   5.08%)
MB/sec scale    40196.42 (   0.00%)    42197.90 (   4.98%)
MB/sec add      37427.00 (   0.00%)    41960.78 (  12.11%)
MB/sec triad    37841.36 (   0.00%)    42513.64 (  12.35%)

12threads stream (on 1NUMA * 24cores = 24cores)
                stream                 stream
                w/o patch              w/ patch
MB/sec copy     52639.82 (   0.00%)    53818.04 (   2.24%)
MB/sec scale    52350.30 (   0.00%)    53253.38 (   1.73%)
MB/sec add      53607.68 (   0.00%)    55198.82 (   2.97%)
MB/sec triad    54776.66 (   0.00%)    56360.40 (   2.89%)

Thus, it could help memory-bound workloads, especially under medium load. Similar improvement is also seen in lkp-pbzip2:

* lkp-pbzip2 benchmark

2-96 threads (on 4NUMA * 24cores = 96cores)
                  lkp-pbzip2               lkp-pbzip2
                  w/o patch                w/ patch
Hmean tput-2    11062841.57 (   0.00%)   11341817.51 *   2.52%*
Hmean tput-5    26815503.70 (   0.00%)   27412872.65 *   2.23%*
Hmean tput-8    41873782.21 (   0.00%)   43326212.92 *   3.47%*
Hmean tput-12   61875980.48 (   0.00%)   64578337.51 *   4.37%*
Hmean tput-21  105814963.07 (   0.00%)  111381851.01 *   5.26%*
Hmean tput-30  150349470.98 (   0.00%)  156507070.73 *   4.10%*
Hmean tput-48  237195937.69 (   0.00%)  242353597.17 *   2.17%*
Hmean tput-79  360252509.37 (   0.00%)  362635169.23 *   0.66%*
Hmean tput-96  394571737.90 (   0.00%)  400952978.48 *   1.62%*

2-24 threads (on 1NUMA * 24cores = 24cores)
                  lkp-pbzip2               lkp-pbzip2
                  w/o patch                w/ patch
Hmean tput-2    11071705.49 (   0.00%)   11296869.10 *   2.03%*
Hmean tput-4    20782165.19 (   0.00%)   21949232.15 *   5.62%*
Hmean tput-6    30489565.14 (   0.00%)   33023026.96 *   8.31%*
Hmean tput-8    40376495.80 (   0.00%)   42779286.27 *   5.95%*
Hmean tput-12   61264033.85 (   0.00%)   62995632.78 *   2.83%*
Hmean tput-18   86697139.39 (   0.00%)   86461545.74 (  -0.27%)
Hmean tput-24  104854637.04 (   0.00%)  104522649.46 *  -0.32%*

In the case of 6 threads and 8 threads, we see the greatest performance improvement. Similar improvement can be seen on lkp-pixz, though the improvement is smaller:

* lkp-pixz benchmark

2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores)
                  lkp-pixz                lkp-pixz
                  w/o patch               w/ patch
Hmean tput-2     6486981.16 (   0.00%)   6561515.98 *   1.15%*
Hmean tput-4    11645766.38 (   0.00%)  11614628.43 (  -0.27%)
Hmean tput-6    15429943.96 (   0.00%)  15957350.76 *   3.42%*
Hmean tput-8    19974087.63 (   0.00%)  20413746.98 *   2.20%*
Hmean tput-12   28172068.18 (   0.00%)  28751997.06 *   2.06%*
Hmean tput-18   39413409.54 (   0.00%)  39896830.55 *   1.23%*
Hmean tput-24   49101815.85 (   0.00%)  49418141.47 *   0.64%*

* SPECrate benchmark

4,8,16 copies mcf_r (on 1NUMA * 32cores = 32cores)
             Base                Base
             Run Time            Rate
             -------             ---------
4 Copies     w/o 580 (w/ 570)    w/o 11.1 (w/ 11.3)
8 Copies     w/o 647 (w/ 605)    w/o 20.0 (w/ 21.4, +7%)
16 Copies    w/o 844 (w/ 844)    w/o 30.6 (w/ 30.6)

32 Copies (on 4NUMA * 32 cores = 128cores)

[w/o patch]
                   Base     Base       Base
Benchmarks         Copies   Run Time   Rate
---------------    -------  ---------  ---------
500.perlbench_r    32       584        87.2  *
502.gcc_r          32       503        90.2  *
505.mcf_r          32       745        69.4  *
520.omnetpp_r      32       1031       40.7  *
523.xalancbmk_r    32       597        56.6  *
525.x264_r         1        --         CE
531.deepsjeng_r    32       336        109   *
541.leela_r        32       556        95.4  *
548.exchange2_r    32       513        163   *
557.xz_r           32       530        65.2  *
 Est. SPECrate2017_int_base            80.3

[w/ patch]
                   Base     Base       Base
Benchmarks         Copies   Run Time   Rate
---------------    -------  ---------  ---------
500.perlbench_r    32       580        87.8 (+0.688%)   *
502.gcc_r          32       477        95.1 (+5.432%)   *
505.mcf_r          32       644        80.3 (+13.574%)  *
520.omnetpp_r      32       942        44.6 (+9.58%)    *
523.xalancbmk_r    32       560        60.4 (+6.714%)   *
525.x264_r         1        --         CE
531.deepsjeng_r    32       337        109 (+0.000%)    *
541.leela_r        32       554        95.6 (+0.210%)   *
548.exchange2_r    32       515        163 (+0.000%)    *
557.xz_r           32       524        66.0 (+1.227%)   *
 Est. SPECrate2017_int_base            83.7 (+4.062%)

On the other hand, it is slightly helpful to CPU-bound tasks like kernbench:

* 24-96 threads kernbench (on 4NUMA * 24cores = 96cores)
                     kernbench              kernbench
                     w/o cluster            w/ cluster
Min       user-24    12054.67 (   0.00%)    12024.19 (   0.25%)
Min       syst-24     1751.51 (   0.00%)     1731.68 (   1.13%)
Min       elsp-24      600.46 (   0.00%)      598.64 (   0.30%)
Min       user-48    12361.93 (   0.00%)    12315.32 (   0.38%)
Min       syst-48     1917.66 (   0.00%)     1892.73 (   1.30%)
Min       elsp-48      333.96 (   0.00%)      332.57 (   0.42%)
Min       user-96    12922.40 (   0.00%)    12921.17 (   0.01%)
Min       syst-96     2143.94 (   0.00%)     2110.39 (   1.56%)
Min       elsp-96      211.22 (   0.00%)      210.47 (   0.36%)
Amean     user-24    12063.99 (   0.00%)    12030.78 *   0.28%*
Amean     syst-24     1755.20 (   0.00%)     1735.53 *   1.12%*
Amean     elsp-24      601.60 (   0.00%)      600.19 (   0.23%)
Amean     user-48    12362.62 (   0.00%)    12315.56 *   0.38%*
Amean     syst-48     1921.59 (   0.00%)     1894.95 *   1.39%*
Amean     elsp-48      334.10 (   0.00%)      332.82 *   0.38%*
Amean     user-96    12925.27 (   0.00%)    12922.63 (   0.02%)
Amean     syst-96     2146.66 (   0.00%)     2122.20 *   1.14%*
Amean     elsp-96      211.96 (   0.00%)      211.79 (   0.08%)

Note this patch isn't a universal win; it might hurt workloads that can benefit from packing. Although tasks that want to take advantage of the lower communication latency of one cluster won't necessarily be packed into one cluster while the kernel is not aware of clusters, they have some chance of being randomly packed; this patch will make it more likely that they are spread. Signed-off-by: Barry Song Tested-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched/topology.h | 7 +++++++ include/linux/topology.h | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 8f0f778b7c91..2f9166f6dec8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -42,6 +42,13 @@ static inline int cpu_smt_flags(void) } #endif +#ifdef CONFIG_SCHED_CLUSTER +static inline int cpu_cluster_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { diff --git a/include/linux/topology.h b/include/linux/topology.h index 80d27d717631..0b3704ad13c8 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -212,6 +212,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) +static inline const struct cpumask *cpu_cluster_mask(int cpu) +{ + return topology_cluster_cpumask(cpu); +} +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); -- cgit v1.2.3 From 810979682ccc98dbd83f341c18a2e556c30a7164 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:50 +0200 Subject: irq_work: Allow irq_work_sync() to sleep if irq_work() has no IRQ support. irq_work() instantly triggers an interrupt if the architecture supports it; otherwise the work is processed on the next timer tick, so in the worst case irq_work_sync() could spin for up to a jiffy. irq_work_sync() is usually used in tear-down context, which is fully preemptible. Based on review, irq_work_sync() is invoked from preemptible context and there is one waiter at a time. This qualifies it to use rcuwait for synchronisation. Let irq_work_sync() synchronize with rcuwait if the architecture processes irq_work via the timer tick.
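With the rcuwait field added in the hunk below, the sync path can be sketched as follows (simplified; the real kernel/irq_work.c also asserts that IRQs are enabled and that sleeping is allowed):

    void irq_work_sync(struct irq_work *work)
    {
        if (!arch_irq_work_has_interrupt()) {
            /* work runs from the timer tick: sleep instead of spinning */
            rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
                               TASK_UNINTERRUPTIBLE);
            return;
        }

        while (irq_work_is_busy(work))
            cpu_relax();
    }

The follow-up patch below extends the sleeping branch to LAZY (non-IRQ_WORK_HARD_IRQ) items on PREEMPT_RT.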
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-3-bigeasy@linutronix.de --- include/linux/irq_work.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index ec2a47a81e42..b48955e9c920 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -3,6 +3,7 @@ #define _LINUX_IRQ_WORK_H #include +#include <linux/rcuwait.h> /* * An entry can be in one of four states: @@ -16,11 +17,13 @@ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) -- cgit v1.2.3 From 09089db79859cbccccd8df95b034f36f7027efa6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:52 +0200 Subject: irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on PREEMPT_RT On PREEMPT_RT most items are processed as LAZY via softirq context. Avoid spin-waiting for them, because irq_work_sync() could have higher priority and not allow the irq-work to be completed. Additionally, wait for !IRQ_WORK_HARD_IRQ irq_work items on PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-5-bigeasy@linutronix.de --- include/linux/irq_work.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index b48955e9c920..8cd11a223260 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -49,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); -- cgit v1.2.3 From 8b8ff8cc3b8155c18162e8b1f70e1230db176862 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 7 Sep 2021 19:39:01 +0300 Subject: perf/x86: Add new event for AUX output counter index PEBS-via-PT records contain a mask of applicable counters. To identify which event belongs to which counter, a side-band event is needed. Until now, there has been no side-band event, and consequently users were limited to using a single event. Add such a side-band event. Note the event is optimised to output only when the counter index changes for an event. That works only so long as all PEBS-via-PT events are scheduled together, which they are for a recording session because they are in a single group. Also, no attribute bit is used to select the new event, so a new kernel is not compatible with older perf tools. The assumption is that PEBS-via-PT is sufficiently esoteric that users will not be troubled by this.
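As a sketch of the consumer side (hypothetical helper; the real hook sits in the Intel PEBS/DS code), a PMU driver emits the side-band record only when an event's counter index changes:

    /* hypothetical: called when a PEBS-via-PT event is (re)scheduled */
    static void report_idx_if_changed(struct perf_event *event, int *last_idx)
    {
        if (event->hw.idx != *last_idx) {
            perf_report_aux_output_id(event, event->hw.idx);
            *last_idx = event->hw.idx;
        }
    }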
Signed-off-by: Adrian Hunter Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210907163903.11820-2-adrian.hunter@intel.com --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2d510ad750ed..126b3a314f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1397,6 +1397,7 @@ perf_event_addr_filters(struct perf_event *event) } extern void perf_event_addr_filters_sync(struct perf_event *event); +extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, -- cgit v1.2.3 From d00e60ee54b12de945b8493cf18c1ada9e422514 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 13 Oct 2021 17:19:20 +0800 Subject: page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA As 32-bit architectures with 64-bit DMA seem to be rare these days, and page pool would otherwise carry a lot of code and complexity for systems that may never need it, disable DMA mapping support for such systems. If drivers really want to work on such systems, they have to implement their own DMA-mapping fallback tracking outside page_pool. Reviewed-by: Ilias Apalodimas Signed-off-by: Yunsheng Lin Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/mm_types.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..436e0946d691 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,18 +104,7 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - union { - /** - * dma_addr_upper: might require a 64-bit - * value on 32-bit architectures. - */ - unsigned long dma_addr_upper; - /** - * For frag page support, not supported in - * 32-bit architectures with 64-bit DMA. - */ - atomic_long_t pp_frag_count; - }; + atomic_long_t pp_frag_count; }; struct { /* slab, slob and slub */ union { -- cgit v1.2.3 From 9028b2463c1ea96f51c3ba53e2479346019ff6ad Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 25 Mar 2021 15:08:44 +0100 Subject: tee: add sec_world_id to struct tee_shm Add sec_world_id to struct tee_shm, which describes a shared memory object. sec_world_id can be used by a driver to store an id assigned by secure world. Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 3ebfea0781f1..a1f03461369b 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -197,7 +197,11 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, * @num_pages: number of locked pages * @dmabuf: dmabuf used to for exporting to user space * @flags: defined by TEE_SHM_* in tee_drv.h - * @id: unique id of a shared memory object on this device + * @id: unique id of a shared memory object on this device, shared + * with user space + * @sec_world_id: + * secure world assigned id of this shared memory object, not + * used by all drivers * * This pool is only supposed to be accessed directly from the TEE * subsystem and from drivers that implements their own shm pool manager.
@@ -213,6 +217,7 @@ struct tee_shm { struct dma_buf *dmabuf; u32 flags; int id; + u64 sec_world_id; }; /** -- cgit v1.2.3 From 9955548919c47a6987b40d90a30fd56bbc043e7b Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Fri, 1 Oct 2021 12:12:30 +0200 Subject: remoteproc: Remove vdev_to_rvdev and vdev_to_rproc from remoteproc API Both of these functions are only used by remoteproc_virtio. There is no reason to expose them in the API, so move them into remoteproc_virtio.c. Signed-off-by: Arnaud Pouliquen Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20211001101234.4247-4-arnaud.pouliquen@foss.st.com --- include/linux/remoteproc.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 83c09ac36b13..e0600e1e5c17 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -684,18 +684,6 @@ int rproc_coredump_add_custom_segment(struct rproc *rproc, void *priv); int rproc_coredump_set_elf_info(struct rproc *rproc, u8 class, u16 machine); -static inline struct rproc_vdev *vdev_to_rvdev(struct virtio_device *vdev) -{ - return container_of(vdev->dev.parent, struct rproc_vdev, dev); -} - -static inline struct rproc *vdev_to_rproc(struct virtio_device *vdev) -{ - struct rproc_vdev *rvdev = vdev_to_rvdev(vdev); - - return rvdev->rproc; -} - void rproc_add_subdev(struct rproc *rproc, struct rproc_subdev *subdev); void rproc_remove_subdev(struct rproc *rproc, struct rproc_subdev *subdev); -- cgit v1.2.3 From 789c1093f02c436b320d78a739f9610c8271cb73 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 11 Oct 2021 21:21:14 +0800 Subject: rtc: class: don't call cdev_device_del() when cdev_device_add() failed I got a null-ptr-deref report when doing fault injection tests:

    general protection fault, probably for non-canonical address 0xdffffc0000000022: 0000 [#1] SMP KASAN PTI
    KASAN: null-ptr-deref in range [0x0000000000000110-0x0000000000000117]
    RIP: 0010:device_del+0x132/0xdc0
    Call Trace:
     cdev_device_del+0x1a/0x80
     devm_rtc_unregister_device+0x37/0x80
     release_nodes+0xc3/0x3b0

If cdev_device_add() fails, 'dev->p' is not set, which causes a null-ptr-deref when cdev_device_del() is later called. Registering the character device is optional and we don't return an error code here, so introduce a new flag 'RTC_NO_CDEV' to indicate whether a character device was created; cdev_device_del() is only called when this bit is not set. Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211011132114.3663509-1-yangyingliang@huawei.com --- include/linux/rtc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index bd611e26291d..354e0843ab17 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -80,6 +80,7 @@ struct rtc_timer { /* flags */ #define RTC_DEV_BUSY 0 +#define RTC_NO_CDEV 1 struct rtc_device { struct device dev; -- cgit v1.2.3 From 4b2c5fa9c9902ce34ecea6711558d9af96351b31 Mon Sep 17 00:00:00 2001 From: Amir Tzin Date: Wed, 21 Jul 2021 16:14:12 +0300 Subject: net/mlx5: Add layout to support default timeouts register Add the needed structures and defines for DTOR (default timeouts register). This will be used to get timeout values from FW instead of hard-coded values in the driver code, thus enabling support for slower devices which need longer timeouts.
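Given the mlx5_ifc layout added below, pulling one timeout out of a queried DTOR register can be sketched as follows (assumes the driver's existing access-register plumbing; not the driver's literal code):

    u32 in[MLX5_ST_SZ_DW(dtor_reg)] = {};
    u32 out[MLX5_ST_SZ_DW(dtor_reg)] = {};
    u32 to_val, to_mul;
    int err;

    err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
                               MLX5_REG_DTOR, 0, 0);
    if (!err) {
        /* MLX5_GET() reaches nested mlx5_ifc fields via dot notation */
        to_val = MLX5_GET(dtor_reg, out, pcie_toggle_to.to_value);
        to_mul = MLX5_GET(dtor_reg, out, pcie_toggle_to.to_multiplier);
    }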
Signed-off-by: Amir Tzin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 +++- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 66eaf0aa7f69..109cc8106d16 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -577,7 +577,9 @@ struct mlx5_init_seg { __be32 rsvd1[120]; __be32 initializing; struct health_buffer health; - __be32 rsvd2[880]; + __be32 rsvd2[878]; + __be32 cmd_exec_to; + __be32 cmd_q_init_to; __be32 internal_timer_h; __be32 internal_timer_l; __be32 rsvd3[2]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0ca719c00824..ccbd87fbd3bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -156,6 +156,7 @@ enum { MLX5_REG_MIRC = 0x9162, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, + MLX5_REG_DTOR = 0xC00E, }; enum mlx5_qpts_trust_state { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 993204a6c1a1..b8bff5109656 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1306,7 +1306,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 vhca_resource_manager[0x1]; u8 hca_cap_2[0x1]; - u8 reserved_at_21[0x2]; + u8 reserved_at_21[0x1]; + u8 dtor[0x1]; u8 event_on_vhca_state_teardown_request[0x1]; u8 event_on_vhca_state_in_use[0x1]; u8 event_on_vhca_state_active[0x1]; @@ -2807,6 +2808,40 @@ struct mlx5_ifc_dropped_packet_logged_bits { u8 reserved_at_0[0xe0]; }; +struct mlx5_ifc_default_timeout_bits { + u8 to_multiplier[0x3]; + u8 reserved_at_3[0x9]; + u8 to_value[0x14]; +}; + +struct mlx5_ifc_dtor_reg_bits { + u8 reserved_at_0[0x20]; + + struct mlx5_ifc_default_timeout_bits pcie_toggle_to; + + u8 reserved_at_40[0x60]; + + struct mlx5_ifc_default_timeout_bits health_poll_to; + + struct mlx5_ifc_default_timeout_bits full_crdump_to; + + struct mlx5_ifc_default_timeout_bits fw_reset_to; + + struct mlx5_ifc_default_timeout_bits flush_on_err_to; + + struct mlx5_ifc_default_timeout_bits pci_sync_update_to; + + struct mlx5_ifc_default_timeout_bits tear_down_to; + + struct mlx5_ifc_default_timeout_bits fsm_reactivate_to; + + struct mlx5_ifc_default_timeout_bits reclaim_pages_to; + + struct mlx5_ifc_default_timeout_bits reclaim_vfs_pages_to; + + u8 reserved_at_1c0[0x40]; +}; + enum { MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN = 0x1, MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR = 0x2, -- cgit v1.2.3 From 5945e1adeab527ec96c75a786213c146d4d482a4 Mon Sep 17 00:00:00 2001 From: Amir Tzin Date: Thu, 7 Oct 2021 18:00:27 +0300 Subject: net/mlx5: Read timeout values from init segment Replace hard-coded timeouts with values stored in the firmware's init segment. Timeouts are read from the init segment during driver load. If init segment timeouts are not supported, then fall back to hard-coded defaults instead. Also move pre-initialization timeouts, which cannot be read from firmware, to the new mechanism.
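A sketch of the fallback idea (hypothetical helper name, and assuming the init segment field is reported in milliseconds and reads as 0 on firmware that predates it):

    static u64 cmd_exec_timeout_ms(struct mlx5_core_dev *dev)
    {
        u32 to = ioread32be(&dev->iseg->cmd_exec_to);  /* field added above */

        return to ? to : 60 * 1000;  /* legacy hard-coded 60s default */
    }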
Signed-off-by: Amir Tzin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ccbd87fbd3bf..fb06e8870aee 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -66,10 +66,6 @@ enum { }; enum { - /* one minute for the sake of bringup. Generally, commands must always - * complete and we may need to increase this timeout value - */ - MLX5_CMD_TIMEOUT_MSEC = 60 * 1000, MLX5_CMD_WQ_MAX_NAME = 32, }; @@ -755,6 +751,7 @@ struct mlx5_core_dev { u32 qcam[MLX5_ST_SZ_DW(qcam_reg)]; u8 embedded_cpu; } caps; + struct mlx5_timeouts *timeouts; u64 sys_image_guid; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; -- cgit v1.2.3 From fbfa97b4d79f26042f188b84959065213e9d3e99 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 18 Aug 2021 13:21:30 +0300 Subject: net/mlx5: Disable roce at HCA level Currently, when a user disables roce via the devlink param, this change isn't passed down to the device. If the device allows disabling RoCE at the device level, make use of it. This instructs the device to skip memory allocations related to RoCE functionality which are otherwise done by the device. Signed-off-by: Shay Drory Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 9 +++++---- include/linux/mlx5/mlx5_ifc.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fb06e8870aee..7c8b5f06c2cd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1251,11 +1251,12 @@ static inline bool mlx5_is_roce_init_enabled(struct mlx5_core_dev *dev) { struct devlink *devlink = priv_to_devlink(dev); union devlink_param_value val; + int err; - devlink_param_driverinit_value_get(devlink, - DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, - &val); - return val.vbool; + err = devlink_param_driverinit_value_get(devlink, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + &val); + return err ? MLX5_CAP_GEN(dev, roce) : val.vbool; } #endif /* MLX5_DRIVER_H */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b8bff5109656..c614ad1da44d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1588,7 +1588,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_tis_per_sq[0x5]; u8 ext_stride_num_range[0x1]; - u8 reserved_at_3a1[0x2]; + u8 roce_rw_supported[0x1]; + u8 reserved_at_3a2[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3 From 2ec16ddde1fa31a83aee04320b248e94348d9152 Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Thu, 16 Sep 2021 10:46:17 +0300 Subject: net/mlx5: Introduce new device index wrapper Introduce a device index wrapper for use by downstream patches.
Signed-off-by: Rongwei Liu Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7c8b5f06c2cd..aecc38b90de5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1243,6 +1243,11 @@ static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev) return MLX5_CAP_GEN(dev, native_port_num); } +static inline int mlx5_get_dev_index(struct mlx5_core_dev *dev) +{ + return PCI_FUNC(dev->pdev->devfn); +} + enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -- cgit v1.2.3 From 1021d0645d593ea86193c5fc371e33e5b208e14d Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Tue, 12 Oct 2021 10:40:52 +0300 Subject: net/mlx5: Use native_port_num as 1st option of device index Using "native_port_num" can support more NICs. Fall back to PCIe IDs if the "native_port_num" query fails. Signed-off-by: Rongwei Liu Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aecc38b90de5..cf508685abca 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1245,7 +1245,12 @@ static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev) static inline int mlx5_get_dev_index(struct mlx5_core_dev *dev) { - return PCI_FUNC(dev->pdev->devfn); + int idx = MLX5_CAP_GEN(dev, native_port_num); + + if (idx >= 1 && idx <= MLX5_MAX_PORTS) + return idx - 1; + else + return PCI_FUNC(dev->pdev->devfn); } enum { -- cgit v1.2.3 From 8e141f9eb803e209714a80aa6ec073893f94c526 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Sep 2021 09:12:40 +0200 Subject: block: drain file system I/O on del_gendisk Instead of delaying the draining of file system I/O related items (the blk-qos queues, the integrity read workqueue and timeouts) until the request_queue is removed, do that when del_gendisk is called. This is important for SCSI, where the upper level drivers that control the gendisk are separate entities, and the disk can be freed much earlier than the request_queue, or can even be unbound without tearing down the queue. Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk") Reported-by: Ming Lei Signed-off-by: Christoph Hellwig Tested-by: Darrick J.
Wong Link: https://lore.kernel.org/r/20210929071241.934472-5-hch@lst.de Tested-by: Yi Zhang Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c68d83c87f83..0f5315c2b5a3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -149,6 +149,7 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 +#define GD_DEAD 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 72bf80cf09c4693780ad93a31b48fa5a4e17a946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 15 Oct 2021 12:14:35 -0300 Subject: regulator: lp872x: replace legacy gpio interface with gpiod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all linux/gpio.h and linux/of_gpio.h dependencies and replace them with the gpiod interface. Signed-off-by: Maíra Canal Message-Id: Signed-off-by: Mark Brown --- include/linux/regulator/lp872x.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/lp872x.h b/include/linux/regulator/lp872x.h index d780dbb8b423..8e7e0343c6e1 100644 --- a/include/linux/regulator/lp872x.h +++ b/include/linux/regulator/lp872x.h @@ -10,7 +10,7 @@ #include #include -#include <linux/gpio.h> +#include <linux/gpio/consumer.h> #define LP872X_MAX_REGULATORS 9 @@ -41,8 +41,8 @@ enum lp872x_regulator_id { }; enum lp872x_dvs_state { - DVS_LOW = GPIOF_OUT_INIT_LOW, - DVS_HIGH = GPIOF_OUT_INIT_HIGH, + DVS_LOW = GPIOD_OUT_LOW, + DVS_HIGH = GPIOD_OUT_HIGH, }; enum lp872x_dvs_sel { @@ -52,12 +52,12 @@ enum lp872x_dvs_sel { /** * lp872x_dvs - * @gpio : gpio pin number for dvs control + * @gpio : gpio descriptor for dvs control * @vsel : dvs selector for buck v1 or buck v2 register * @init_state : initial dvs pin state */ struct lp872x_dvs { - int gpio; + struct gpio_desc *gpio; enum lp872x_dvs_sel vsel; enum lp872x_dvs_state init_state; }; @@ -78,14 +78,14 @@ struct lp872x_regulator_data { * @update_config : if LP872X_GENERAL_CFG register is updated, set true * @regulator_data : platform regulator id and init data * @dvs : dvs data for buck voltage control - * @enable_gpio : gpio pin number for enable control + * @enable_gpio : gpio descriptor for enable control */ struct lp872x_platform_data { u8 general_config; bool update_config; struct lp872x_regulator_data regulator_data[LP872X_MAX_REGULATORS]; struct lp872x_dvs *dvs; - int enable_gpio; + struct gpio_desc *enable_gpio; }; #endif -- cgit v1.2.3 From c3f69c7f629ff53c75f335754ea41426b8e095de Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 12 Oct 2021 16:35:14 -0700 Subject: scsi: ata: Switch to attribute groups struct device supports attribute groups directly but does not support struct device_attribute directly. Hence switch to attribute groups. Link: https://lore.kernel.org/r/20211012233558.4066756-3-bvanassche@acm.org Acked-by: Damien Le Moal Signed-off-by: Bart Van Assche Signed-off-by: Martin K.
Petersen --- include/linux/libata.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index c0c64f03e107..bd1b782d1bbf 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1388,7 +1388,7 @@ extern int ata_link_nr_enabled(struct ata_link *link); */ extern const struct ata_port_operations ata_base_port_ops; extern const struct ata_port_operations sata_port_ops; -extern struct device_attribute *ata_common_sdev_attrs[]; +extern const struct attribute_group *ata_common_sdev_groups[]; /* * All sht initializers (BASE, PIO, BMDMA, NCQ) must be instantiated @@ -1418,14 +1418,14 @@ extern struct device_attribute *ata_common_sdev_attrs[]; #define ATA_BASE_SHT(drv_name) \ ATA_SUBBASE_SHT(drv_name), \ - .sdev_attrs = ata_common_sdev_attrs + .sdev_groups = ata_common_sdev_groups #ifdef CONFIG_SATA_HOST -extern struct device_attribute *ata_ncq_sdev_attrs[]; +extern const struct attribute_group *ata_ncq_sdev_groups[]; #define ATA_NCQ_SHT(drv_name) \ ATA_SUBBASE_SHT(drv_name), \ - .sdev_attrs = ata_ncq_sdev_attrs, \ + .sdev_groups = ata_ncq_sdev_groups, \ .change_queue_depth = ata_scsi_change_queue_depth #endif -- cgit v1.2.3 From 05593a3fd1037b5fee85d3c8c28112f19e7baa06 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 27 Aug 2021 12:47:45 +0900 Subject: counter: stm32-lptimer-cnt: Provide defines for clock polarities The STM32 low-power timer permits configuration of the clock polarity via the LPTIMX_CFGR register CKPOL bits. This patch provides preprocessor defines for the supported clock polarities. Cc: Fabrice Gasnier Signed-off-by: William Breathitt Gray Reviewed-by: Fabrice Gasnier Link: https://lore.kernel.org/r/a111c8905c467805ca530728f88189b59430f27e.1630031207.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/mfd/stm32-lptimer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-lptimer.h b/include/linux/mfd/stm32-lptimer.h index 90b20550c1c8..06d3f11dc3c9 100644 --- a/include/linux/mfd/stm32-lptimer.h +++ b/include/linux/mfd/stm32-lptimer.h @@ -45,6 +45,11 @@ #define STM32_LPTIM_PRESC GENMASK(11, 9) #define STM32_LPTIM_CKPOL GENMASK(2, 1) +/* STM32_LPTIM_CKPOL */ +#define STM32_LPTIM_CKPOL_RISING_EDGE 0 +#define STM32_LPTIM_CKPOL_FALLING_EDGE 1 +#define STM32_LPTIM_CKPOL_BOTH_EDGES 2 + /* STM32_LPTIM_ARR */ #define STM32_LPTIM_MAX_ARR 0xFFFF -- cgit v1.2.3 From ea434ff82649111de4fcabd76187270f8abdb63a Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 27 Aug 2021 12:47:46 +0900 Subject: counter: stm32-timer-cnt: Provide defines for slave mode selection The STM32 timer permits configuration of the counter encoder mode via the slave mode control register (SMCR) slave mode selection (SMS) bits. This patch provides preprocessor defines for the supported encoder modes. 
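For illustration, selecting quadrature x4 mode from the counter driver then becomes a single field update (sketch; assumes the driver's regmap handle and the TIM_SMCR_SMS field mask from stm32-timers.h):

    /* count on both TI1FP1 and TI2FP2 edges (quadrature x4) */
    regmap_update_bits(priv->regmap, TIM_SMCR, TIM_SMCR_SMS,
                       TIM_SMCR_SMS_ENCODER_MODE_3);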
Cc: Fabrice Gasnier Signed-off-by: William Breathitt Gray Reviewed-by: Fabrice Gasnier Link: https://lore.kernel.org/r/ad3d9cd7af580d586316d368f74964cbc394f981.1630031207.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/mfd/stm32-timers.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index f8db83aedb2b..5f5c43fd69dd 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -82,6 +82,10 @@ #define MAX_TIM_ICPSC 0x3 #define TIM_CR2_MMS_SHIFT 4 #define TIM_CR2_MMS2_SHIFT 20 +#define TIM_SMCR_SMS_SLAVE_MODE_DISABLED 0 /* counts on internal clock when CEN=1 */ +#define TIM_SMCR_SMS_ENCODER_MODE_1 1 /* counts TI1FP1 edges, depending on TI2FP2 level */ +#define TIM_SMCR_SMS_ENCODER_MODE_2 2 /* counts TI2FP2 edges, depending on TI1FP1 level */ +#define TIM_SMCR_SMS_ENCODER_MODE_3 3 /* counts on both TI1FP1 and TI2FP2 edges */ #define TIM_SMCR_TS_SHIFT 4 #define TIM_BDTR_BKF_MASK 0xF #define TIM_BDTR_BKF_SHIFT(x) (16 + (x) * 4) -- cgit v1.2.3 From aaec1a0f76ec25f46bbb17b81487c4b0e1c318c5 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 27 Aug 2021 12:47:47 +0900 Subject: counter: Internalize sysfs interface code This is a reimplementation of the Generic Counter driver interface. There are no modifications to the Counter subsystem userspace interface, so existing userspace applications should continue to run seamlessly. The purpose of this patch is to internalize the sysfs interface code among the various counter drivers into a shared module. Counter drivers pass and take data natively (i.e. u8, u64, etc.) and the shared counter module handles the translation between the sysfs interface and the device drivers. This guarantees a standard userspace interface for all counter drivers, and helps generalize the Generic Counter driver ABI in order to support the Generic Counter chrdev interface (introduced in a subsequent patch) without significant changes to the existing counter drivers. Note, Counter device registration is the same as before: drivers populate a struct counter_device with components and callbacks, then pass the structure to the devm_counter_register function. However, what's different now is how the Counter subsystem code handles this registration internally. Whereas before callbacks would interact directly with sysfs data, this interaction is now abstracted and instead callbacks interact with native C data types. The counter_comp structure forms the basis for Counter extensions. The counter-sysfs.c file contains the code to parse through the counter_device structure and register the requested components and extensions. Attributes are created and populated based on type, with respective translation functions to handle the mapping between sysfs and the counter driver callbacks. The translation performed for each attribute is straightforward: the attribute type and data is parsed from the counter_attribute structure, the respective counter driver read/write callback is called, and sysfs I/O is handled before or after the driver read/write function is called. 
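To make the new callback style concrete, a Count extension under this scheme can be sketched as below (the my_* names are hypothetical); the driver traffics in a native u8 and the shared module does all sysfs parsing and formatting:

    static int my_enable_read(struct counter_device *counter,
                              struct counter_count *count, u8 *val)
    {
        *val = my_hw_is_enabled(counter, count->id);  /* hypothetical hw helper */
        return 0;
    }

    static int my_enable_write(struct counter_device *counter,
                               struct counter_count *count, u8 val)
    {
        return my_hw_set_enabled(counter, count->id, val);
    }

    static struct counter_comp my_count_ext[] = {
        {
            .type = COUNTER_COMP_BOOL,  /* sysfs presents "0"/"1" */
            .name = "enable",
            .count_u8_read = my_enable_read,
            .count_u8_write = my_enable_write,
        },
    };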
Cc: Jarkko Nikula Cc: Patrick Havelange Cc: Kamel Bouhara Cc: Maxime Coquelin Cc: Alexandre Torgue Cc: Dan Carpenter Acked-by: Syed Nayyar Waris Reviewed-by: David Lechner Tested-by: David Lechner Signed-off-by: William Breathitt Gray Reviewed-by: Fabrice Gasnier # for stm32 Link: https://lore.kernel.org/r/c68b4a1ffb195c1a2f65e8dd5ad7b7c14e79c6ef.1630031207.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/counter.h | 658 ++++++++++++++++++++++--------------------- include/linux/counter_enum.h | 45 --- 2 files changed, 338 insertions(+), 365 deletions(-) delete mode 100644 include/linux/counter_enum.h (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index d16ce2819b48..b69277f5c4c5 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -6,42 +6,184 @@ #ifndef _COUNTER_H_ #define _COUNTER_H_ -#include #include +#include #include +struct counter_device; +struct counter_count; +struct counter_synapse; +struct counter_signal; + +enum counter_comp_type { + COUNTER_COMP_U8, + COUNTER_COMP_U64, + COUNTER_COMP_BOOL, + COUNTER_COMP_SIGNAL_LEVEL, + COUNTER_COMP_FUNCTION, + COUNTER_COMP_SYNAPSE_ACTION, + COUNTER_COMP_ENUM, + COUNTER_COMP_COUNT_DIRECTION, + COUNTER_COMP_COUNT_MODE, +}; + +enum counter_scope { + COUNTER_SCOPE_DEVICE, + COUNTER_SCOPE_SIGNAL, + COUNTER_SCOPE_COUNT, +}; + enum counter_count_direction { - COUNTER_COUNT_DIRECTION_FORWARD = 0, - COUNTER_COUNT_DIRECTION_BACKWARD + COUNTER_COUNT_DIRECTION_FORWARD, + COUNTER_COUNT_DIRECTION_BACKWARD, }; -extern const char *const counter_count_direction_str[2]; enum counter_count_mode { - COUNTER_COUNT_MODE_NORMAL = 0, + COUNTER_COUNT_MODE_NORMAL, COUNTER_COUNT_MODE_RANGE_LIMIT, COUNTER_COUNT_MODE_NON_RECYCLE, - COUNTER_COUNT_MODE_MODULO_N + COUNTER_COUNT_MODE_MODULO_N, }; -extern const char *const counter_count_mode_str[4]; -struct counter_device; -struct counter_signal; +enum counter_function { + COUNTER_FUNCTION_INCREASE, + COUNTER_FUNCTION_DECREASE, + COUNTER_FUNCTION_PULSE_DIRECTION, + COUNTER_FUNCTION_QUADRATURE_X1_A, + COUNTER_FUNCTION_QUADRATURE_X1_B, + COUNTER_FUNCTION_QUADRATURE_X2_A, + COUNTER_FUNCTION_QUADRATURE_X2_B, + COUNTER_FUNCTION_QUADRATURE_X4, +}; + +enum counter_signal_level { + COUNTER_SIGNAL_LEVEL_LOW, + COUNTER_SIGNAL_LEVEL_HIGH, +}; + +enum counter_synapse_action { + COUNTER_SYNAPSE_ACTION_NONE, + COUNTER_SYNAPSE_ACTION_RISING_EDGE, + COUNTER_SYNAPSE_ACTION_FALLING_EDGE, + COUNTER_SYNAPSE_ACTION_BOTH_EDGES, +}; /** - * struct counter_signal_ext - Counter Signal extensions - * @name: attribute name - * @read: read callback for this attribute; may be NULL - * @write: write callback for this attribute; may be NULL - * @priv: data private to the driver + * struct counter_comp - Counter component node + * @type: Counter component data type + * @name: device-specific component name + * @priv: component-relevant data + * @action_read Synapse action mode read callback. The read value of the + * respective Synapse action mode should be passed back via + * the action parameter. + * @device_u8_read Device u8 component read callback. The read value of the + * respective Device u8 component should be passed back via + * the val parameter. + * @count_u8_read Count u8 component read callback. The read value of the + * respective Count u8 component should be passed back via + * the val parameter. + * @signal_u8_read Signal u8 component read callback. 
The read value of the + * respective Signal u8 component should be passed back via + * the val parameter. + * @device_u32_read Device u32 component read callback. The read value of + * the respective Device u32 component should be passed + * back via the val parameter. + * @count_u32_read Count u32 component read callback. The read value of the + * respective Count u32 component should be passed back via + * the val parameter. + * @signal_u32_read Signal u32 component read callback. The read value of + * the respective Signal u32 component should be passed + * back via the val parameter. + * @device_u64_read Device u64 component read callback. The read value of + * the respective Device u64 component should be passed + * back via the val parameter. + * @count_u64_read Count u64 component read callback. The read value of the + * respective Count u64 component should be passed back via + * the val parameter. + * @signal_u64_read Signal u64 component read callback. The read value of + * the respective Signal u64 component should be passed + * back via the val parameter. + * @action_write Synapse action mode write callback. The write value of + * the respective Synapse action mode is passed via the + * action parameter. + * @device_u8_write Device u8 component write callback. The write value of + * the respective Device u8 component is passed via the val + * parameter. + * @count_u8_write Count u8 component write callback. The write value of + * the respective Count u8 component is passed via the val + * parameter. + * @signal_u8_write Signal u8 component write callback. The write value of + * the respective Signal u8 component is passed via the val + * parameter. + * @device_u32_write Device u32 component write callback. The write value of + * the respective Device u32 component is passed via the + * val parameter. + * @count_u32_write Count u32 component write callback. The write value of + * the respective Count u32 component is passed via the val + * parameter. + * @signal_u32_write Signal u32 component write callback. The write value of + * the respective Signal u32 component is passed via the + * val parameter. + * @device_u64_write Device u64 component write callback. The write value of + * the respective Device u64 component is passed via the + * val parameter. + * @count_u64_write Count u64 component write callback. The write value of + * the respective Count u64 component is passed via the val + * parameter. + * @signal_u64_write Signal u64 component write callback. The write value of + * the respective Signal u64 component is passed via the + * val parameter. 
*/ -struct counter_signal_ext { +struct counter_comp { + enum counter_comp_type type; const char *name; - ssize_t (*read)(struct counter_device *counter, - struct counter_signal *signal, void *priv, char *buf); - ssize_t (*write)(struct counter_device *counter, - struct counter_signal *signal, void *priv, - const char *buf, size_t len); void *priv; + union { + int (*action_read)(struct counter_device *counter, + struct counter_count *count, + struct counter_synapse *synapse, + enum counter_synapse_action *action); + int (*device_u8_read)(struct counter_device *counter, u8 *val); + int (*count_u8_read)(struct counter_device *counter, + struct counter_count *count, u8 *val); + int (*signal_u8_read)(struct counter_device *counter, + struct counter_signal *signal, u8 *val); + int (*device_u32_read)(struct counter_device *counter, + u32 *val); + int (*count_u32_read)(struct counter_device *counter, + struct counter_count *count, u32 *val); + int (*signal_u32_read)(struct counter_device *counter, + struct counter_signal *signal, u32 *val); + int (*device_u64_read)(struct counter_device *counter, + u64 *val); + int (*count_u64_read)(struct counter_device *counter, + struct counter_count *count, u64 *val); + int (*signal_u64_read)(struct counter_device *counter, + struct counter_signal *signal, u64 *val); + }; + union { + int (*action_write)(struct counter_device *counter, + struct counter_count *count, + struct counter_synapse *synapse, + enum counter_synapse_action action); + int (*device_u8_write)(struct counter_device *counter, u8 val); + int (*count_u8_write)(struct counter_device *counter, + struct counter_count *count, u8 val); + int (*signal_u8_write)(struct counter_device *counter, + struct counter_signal *signal, u8 val); + int (*device_u32_write)(struct counter_device *counter, + u32 val); + int (*count_u32_write)(struct counter_device *counter, + struct counter_count *count, u32 val); + int (*signal_u32_write)(struct counter_device *counter, + struct counter_signal *signal, u32 val); + int (*device_u64_write)(struct counter_device *counter, + u64 val); + int (*count_u64_write)(struct counter_device *counter, + struct counter_count *count, u64 val); + int (*signal_u64_write)(struct counter_device *counter, + struct counter_signal *signal, u64 val); + }; }; /** @@ -51,248 +193,52 @@ struct counter_signal_ext { * as it appears in the datasheet documentation * @ext: optional array of Counter Signal extensions * @num_ext: number of Counter Signal extensions specified in @ext - * @priv: optional private data supplied by driver */ struct counter_signal { int id; const char *name; - const struct counter_signal_ext *ext; + struct counter_comp *ext; size_t num_ext; - - void *priv; -}; - -/** - * struct counter_signal_enum_ext - Signal enum extension attribute - * @items: Array of strings - * @num_items: Number of items specified in @items - * @set: Set callback function; may be NULL - * @get: Get callback function; may be NULL - * - * The counter_signal_enum_ext structure can be used to implement enum style - * Signal extension attributes. Enum style attributes are those which have a set - * of strings that map to unsigned integer values. The Generic Counter Signal - * enum extension helper code takes care of mapping between value and string, as - * well as generating a "_available" file which contains a list of all available - * items. 
The get callback is used to query the currently active item; the index - * of the item within the respective items array is returned via the 'item' - * parameter. The set callback is called when the attribute is updated; the - * 'item' parameter contains the index of the newly activated item within the - * respective items array. - */ -struct counter_signal_enum_ext { - const char * const *items; - size_t num_items; - int (*get)(struct counter_device *counter, - struct counter_signal *signal, size_t *item); - int (*set)(struct counter_device *counter, - struct counter_signal *signal, size_t item); -}; - -/** - * COUNTER_SIGNAL_ENUM() - Initialize Signal enum extension - * @_name: Attribute name - * @_e: Pointer to a counter_signal_enum_ext structure - * - * This should usually be used together with COUNTER_SIGNAL_ENUM_AVAILABLE() - */ -#define COUNTER_SIGNAL_ENUM(_name, _e) \ -{ \ - .name = (_name), \ - .read = counter_signal_enum_read, \ - .write = counter_signal_enum_write, \ - .priv = (_e) \ -} - -/** - * COUNTER_SIGNAL_ENUM_AVAILABLE() - Initialize Signal enum available extension - * @_name: Attribute name ("_available" will be appended to the name) - * @_e: Pointer to a counter_signal_enum_ext structure - * - * Creates a read only attribute that lists all the available enum items in a - * newline separated list. This should usually be used together with - * COUNTER_SIGNAL_ENUM() - */ -#define COUNTER_SIGNAL_ENUM_AVAILABLE(_name, _e) \ -{ \ - .name = (_name "_available"), \ - .read = counter_signal_enum_available_read, \ - .priv = (_e) \ -} - -enum counter_synapse_action { - COUNTER_SYNAPSE_ACTION_NONE = 0, - COUNTER_SYNAPSE_ACTION_RISING_EDGE, - COUNTER_SYNAPSE_ACTION_FALLING_EDGE, - COUNTER_SYNAPSE_ACTION_BOTH_EDGES }; /** * struct counter_synapse - Counter Synapse node - * @action: index of current action mode * @actions_list: array of available action modes * @num_actions: number of action modes specified in @actions_list * @signal: pointer to associated signal */ struct counter_synapse { - size_t action; const enum counter_synapse_action *actions_list; size_t num_actions; struct counter_signal *signal; }; -struct counter_count; - -/** - * struct counter_count_ext - Counter Count extension - * @name: attribute name - * @read: read callback for this attribute; may be NULL - * @write: write callback for this attribute; may be NULL - * @priv: data private to the driver - */ -struct counter_count_ext { - const char *name; - ssize_t (*read)(struct counter_device *counter, - struct counter_count *count, void *priv, char *buf); - ssize_t (*write)(struct counter_device *counter, - struct counter_count *count, void *priv, - const char *buf, size_t len); - void *priv; -}; - -enum counter_function { - COUNTER_FUNCTION_INCREASE = 0, - COUNTER_FUNCTION_DECREASE, - COUNTER_FUNCTION_PULSE_DIRECTION, - COUNTER_FUNCTION_QUADRATURE_X1_A, - COUNTER_FUNCTION_QUADRATURE_X1_B, - COUNTER_FUNCTION_QUADRATURE_X2_A, - COUNTER_FUNCTION_QUADRATURE_X2_B, - COUNTER_FUNCTION_QUADRATURE_X4 -}; - /** * struct counter_count - Counter Count node * @id: unique ID used to identify Count * @name: device-specific Count name; ideally, this should match * the name as it appears in the datasheet documentation - * @function: index of current function mode * @functions_list: array available function modes * @num_functions: number of function modes specified in @functions_list * @synapses: array of synapses for initialization * @num_synapses: number of synapses specified in @synapses * @ext: optional array of Counter Count 
extensions * @num_ext: number of Counter Count extensions specified in @ext - * @priv: optional private data supplied by driver */ struct counter_count { int id; const char *name; - size_t function; const enum counter_function *functions_list; size_t num_functions; struct counter_synapse *synapses; size_t num_synapses; - const struct counter_count_ext *ext; + struct counter_comp *ext; size_t num_ext; - - void *priv; -}; - -/** - * struct counter_count_enum_ext - Count enum extension attribute - * @items: Array of strings - * @num_items: Number of items specified in @items - * @set: Set callback function; may be NULL - * @get: Get callback function; may be NULL - * - * The counter_count_enum_ext structure can be used to implement enum style - * Count extension attributes. Enum style attributes are those which have a set - * of strings that map to unsigned integer values. The Generic Counter Count - * enum extension helper code takes care of mapping between value and string, as - * well as generating a "_available" file which contains a list of all available - * items. The get callback is used to query the currently active item; the index - * of the item within the respective items array is returned via the 'item' - * parameter. The set callback is called when the attribute is updated; the - * 'item' parameter contains the index of the newly activated item within the - * respective items array. - */ -struct counter_count_enum_ext { - const char * const *items; - size_t num_items; - int (*get)(struct counter_device *counter, struct counter_count *count, - size_t *item); - int (*set)(struct counter_device *counter, struct counter_count *count, - size_t item); -}; - -/** - * COUNTER_COUNT_ENUM() - Initialize Count enum extension - * @_name: Attribute name - * @_e: Pointer to a counter_count_enum_ext structure - * - * This should usually be used together with COUNTER_COUNT_ENUM_AVAILABLE() - */ -#define COUNTER_COUNT_ENUM(_name, _e) \ -{ \ - .name = (_name), \ - .read = counter_count_enum_read, \ - .write = counter_count_enum_write, \ - .priv = (_e) \ -} - -/** - * COUNTER_COUNT_ENUM_AVAILABLE() - Initialize Count enum available extension - * @_name: Attribute name ("_available" will be appended to the name) - * @_e: Pointer to a counter_count_enum_ext structure - * - * Creates a read only attribute that lists all the available enum items in a - * newline separated list. 
This should usually be used together with - * COUNTER_COUNT_ENUM() - */ -#define COUNTER_COUNT_ENUM_AVAILABLE(_name, _e) \ -{ \ - .name = (_name "_available"), \ - .read = counter_count_enum_available_read, \ - .priv = (_e) \ -} - -/** - * struct counter_device_attr_group - internal container for attribute group - * @attr_group: Counter sysfs attributes group - * @attr_list: list to keep track of created Counter sysfs attributes - * @num_attr: number of Counter sysfs attributes - */ -struct counter_device_attr_group { - struct attribute_group attr_group; - struct list_head attr_list; - size_t num_attr; -}; - -/** - * struct counter_device_state - internal state container for a Counter device - * @id: unique ID used to identify the Counter - * @dev: internal device structure - * @groups_list: attribute groups list (for Signals, Counts, and ext) - * @num_groups: number of attribute groups containers - * @groups: Counter sysfs attribute groups (to populate @dev.groups) - */ -struct counter_device_state { - int id; - struct device dev; - struct counter_device_attr_group *groups_list; - size_t num_groups; - const struct attribute_group **groups; -}; - -enum counter_signal_level { - COUNTER_SIGNAL_LEVEL_LOW, - COUNTER_SIGNAL_LEVEL_HIGH, }; /** @@ -306,117 +252,47 @@ enum counter_signal_level { * @count_write: optional write callback for Count attribute. The write * value for the respective Count is passed in via the val * parameter. - * @function_get: function to get the current count function mode. Returns - * 0 on success and negative error code on error. The index - * of the respective Count's returned function mode should - * be passed back via the function parameter. - * @function_set: function to set the count function mode. function is the - * index of the requested function mode from the respective - * Count's functions_list array. - * @action_get: function to get the current action mode. Returns 0 on - * success and negative error code on error. The index of - * the respective Synapse's returned action mode should be - * passed back via the action parameter. - * @action_set: function to set the action mode. action is the index of - * the requested action mode from the respective Synapse's - * actions_list array. + * @function_read: read callback for the Count function modes. The read + * function mode of the respective Count should be passed + * back via the function parameter. + * @function_write: write callback for Count function modes. The function + * mode to write for the respective Count is passed in via + * the function parameter. + * @action_read: read callback for the Synapse action modes. The read action + * mode of the respective Synapse should be passed back via + * the action parameter. + * @action_write: write callback for Synapse action modes. The action mode + * to write for the respective Synapse is passed in via the + * action parameter.
*/ struct counter_ops { int (*signal_read)(struct counter_device *counter, struct counter_signal *signal, enum counter_signal_level *level); int (*count_read)(struct counter_device *counter, - struct counter_count *count, unsigned long *val); + struct counter_count *count, u64 *value); int (*count_write)(struct counter_device *counter, - struct counter_count *count, unsigned long val); - int (*function_get)(struct counter_device *counter, - struct counter_count *count, size_t *function); - int (*function_set)(struct counter_device *counter, - struct counter_count *count, size_t function); - int (*action_get)(struct counter_device *counter, - struct counter_count *count, - struct counter_synapse *synapse, size_t *action); - int (*action_set)(struct counter_device *counter, - struct counter_count *count, - struct counter_synapse *synapse, size_t action); -}; - -/** - * struct counter_device_ext - Counter device extension - * @name: attribute name - * @read: read callback for this attribute; may be NULL - * @write: write callback for this attribute; may be NULL - * @priv: data private to the driver - */ -struct counter_device_ext { - const char *name; - ssize_t (*read)(struct counter_device *counter, void *priv, char *buf); - ssize_t (*write)(struct counter_device *counter, void *priv, - const char *buf, size_t len); - void *priv; + struct counter_count *count, u64 value); + int (*function_read)(struct counter_device *counter, + struct counter_count *count, + enum counter_function *function); + int (*function_write)(struct counter_device *counter, + struct counter_count *count, + enum counter_function function); + int (*action_read)(struct counter_device *counter, + struct counter_count *count, + struct counter_synapse *synapse, + enum counter_synapse_action *action); + int (*action_write)(struct counter_device *counter, + struct counter_count *count, + struct counter_synapse *synapse, + enum counter_synapse_action action); }; -/** - * struct counter_device_enum_ext - Counter enum extension attribute - * @items: Array of strings - * @num_items: Number of items specified in @items - * @set: Set callback function; may be NULL - * @get: Get callback function; may be NULL - * - * The counter_device_enum_ext structure can be used to implement enum style - * Counter extension attributes. Enum style attributes are those which have a - * set of strings that map to unsigned integer values. The Generic Counter enum - * extension helper code takes care of mapping between value and string, as well - * as generating a "_available" file which contains a list of all available - * items. The get callback is used to query the currently active item; the index - * of the item within the respective items array is returned via the 'item' - * parameter. The set callback is called when the attribute is updated; the - * 'item' parameter contains the index of the newly activated item within the - * respective items array. 
- */ -struct counter_device_enum_ext { - const char * const *items; - size_t num_items; - int (*get)(struct counter_device *counter, size_t *item); - int (*set)(struct counter_device *counter, size_t item); -}; - -/** - * COUNTER_DEVICE_ENUM() - Initialize Counter enum extension - * @_name: Attribute name - * @_e: Pointer to a counter_device_enum_ext structure - * - * This should usually be used together with COUNTER_DEVICE_ENUM_AVAILABLE() - */ -#define COUNTER_DEVICE_ENUM(_name, _e) \ -{ \ - .name = (_name), \ - .read = counter_device_enum_read, \ - .write = counter_device_enum_write, \ - .priv = (_e) \ -} - -/** - * COUNTER_DEVICE_ENUM_AVAILABLE() - Initialize Counter enum available extension - * @_name: Attribute name ("_available" will be appended to the name) - * @_e: Pointer to a counter_device_enum_ext structure - * - * Creates a read only attribute that lists all the available enum items in a - * newline separated list. This should usually be used together with - * COUNTER_DEVICE_ENUM() - */ -#define COUNTER_DEVICE_ENUM_AVAILABLE(_name, _e) \ -{ \ - .name = (_name "_available"), \ - .read = counter_device_enum_available_read, \ - .priv = (_e) \ -} - /** * struct counter_device - Counter data structure * @name: name of the device as it appears in the datasheet * @parent: optional parent device providing the counters - * @device_state: internal device state container * @ops: callbacks from driver * @signals: array of Signals * @num_signals: number of Signals specified in @signals @@ -425,11 +301,11 @@ struct counter_device_enum_ext { * @ext: optional array of Counter device extensions * @num_ext: number of Counter device extensions specified in @ext * @priv: optional private data supplied by driver + * @dev: internal device structure */ struct counter_device { const char *name; struct device *parent; - struct counter_device_state *device_state; const struct counter_ops *ops; @@ -438,17 +314,159 @@ struct counter_device { struct counter_count *counts; size_t num_counts; - const struct counter_device_ext *ext; + struct counter_comp *ext; size_t num_ext; void *priv; + + struct device dev; }; int counter_register(struct counter_device *const counter); void counter_unregister(struct counter_device *const counter); int devm_counter_register(struct device *dev, struct counter_device *const counter); -void devm_counter_unregister(struct device *dev, - struct counter_device *const counter); + +#define COUNTER_COMP_DEVICE_U8(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U8, \ + .name = (_name), \ + .device_u8_read = (_read), \ + .device_u8_write = (_write), \ +} +#define COUNTER_COMP_COUNT_U8(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U8, \ + .name = (_name), \ + .count_u8_read = (_read), \ + .count_u8_write = (_write), \ +} +#define COUNTER_COMP_SIGNAL_U8(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U8, \ + .name = (_name), \ + .signal_u8_read = (_read), \ + .signal_u8_write = (_write), \ +} + +#define COUNTER_COMP_DEVICE_U64(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U64, \ + .name = (_name), \ + .device_u64_read = (_read), \ + .device_u64_write = (_write), \ +} +#define COUNTER_COMP_COUNT_U64(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U64, \ + .name = (_name), \ + .count_u64_read = (_read), \ + .count_u64_write = (_write), \ +} +#define COUNTER_COMP_SIGNAL_U64(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_U64, \ + .name = (_name), \ + .signal_u64_read = (_read), \ + .signal_u64_write = (_write), \ +} + +#define COUNTER_COMP_DEVICE_BOOL(_name, 
_read, _write) \ +{ \ + .type = COUNTER_COMP_BOOL, \ + .name = (_name), \ + .device_u8_read = (_read), \ + .device_u8_write = (_write), \ +} +#define COUNTER_COMP_COUNT_BOOL(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_BOOL, \ + .name = (_name), \ + .count_u8_read = (_read), \ + .count_u8_write = (_write), \ +} +#define COUNTER_COMP_SIGNAL_BOOL(_name, _read, _write) \ +{ \ + .type = COUNTER_COMP_BOOL, \ + .name = (_name), \ + .signal_u8_read = (_read), \ + .signal_u8_write = (_write), \ +} + +struct counter_available { + union { + const u32 *enums; + const char *const *strs; + }; + size_t num_items; +}; + +#define DEFINE_COUNTER_AVAILABLE(_name, _enums) \ + struct counter_available _name = { \ + .enums = (_enums), \ + .num_items = ARRAY_SIZE(_enums), \ + } + +#define DEFINE_COUNTER_ENUM(_name, _strs) \ + struct counter_available _name = { \ + .strs = (_strs), \ + .num_items = ARRAY_SIZE(_strs), \ + } + +#define COUNTER_COMP_DEVICE_ENUM(_name, _get, _set, _available) \ +{ \ + .type = COUNTER_COMP_ENUM, \ + .name = (_name), \ + .device_u32_read = (_get), \ + .device_u32_write = (_set), \ + .priv = &(_available), \ +} +#define COUNTER_COMP_COUNT_ENUM(_name, _get, _set, _available) \ +{ \ + .type = COUNTER_COMP_ENUM, \ + .name = (_name), \ + .count_u32_read = (_get), \ + .count_u32_write = (_set), \ + .priv = &(_available), \ +} +#define COUNTER_COMP_SIGNAL_ENUM(_name, _get, _set, _available) \ +{ \ + .type = COUNTER_COMP_ENUM, \ + .name = (_name), \ + .signal_u32_read = (_get), \ + .signal_u32_write = (_set), \ + .priv = &(_available), \ +} + +#define COUNTER_COMP_CEILING(_read, _write) \ + COUNTER_COMP_COUNT_U64("ceiling", _read, _write) + +#define COUNTER_COMP_COUNT_MODE(_read, _write, _available) \ +{ \ + .type = COUNTER_COMP_COUNT_MODE, \ + .name = "count_mode", \ + .count_u32_read = (_read), \ + .count_u32_write = (_write), \ + .priv = &(_available), \ +} + +#define COUNTER_COMP_DIRECTION(_read) \ +{ \ + .type = COUNTER_COMP_COUNT_DIRECTION, \ + .name = "direction", \ + .count_u32_read = (_read), \ +} + +#define COUNTER_COMP_ENABLE(_read, _write) \ + COUNTER_COMP_COUNT_BOOL("enable", _read, _write) + +#define COUNTER_COMP_FLOOR(_read, _write) \ + COUNTER_COMP_COUNT_U64("floor", _read, _write) + +#define COUNTER_COMP_PRESET(_read, _write) \ + COUNTER_COMP_COUNT_U64("preset", _read, _write) + +#define COUNTER_COMP_PRESET_ENABLE(_read, _write) \ + COUNTER_COMP_COUNT_BOOL("preset_enable", _read, _write) #endif /* _COUNTER_H_ */ diff --git a/include/linux/counter_enum.h b/include/linux/counter_enum.h deleted file mode 100644 index 9f917298a88f..000000000000 --- a/include/linux/counter_enum.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Counter interface enum functions - * Copyright (C) 2018 William Breathitt Gray - */ -#ifndef _COUNTER_ENUM_H_ -#define _COUNTER_ENUM_H_ - -#include - -struct counter_device; -struct counter_signal; -struct counter_count; - -ssize_t counter_signal_enum_read(struct counter_device *counter, - struct counter_signal *signal, void *priv, - char *buf); -ssize_t counter_signal_enum_write(struct counter_device *counter, - struct counter_signal *signal, void *priv, - const char *buf, size_t len); - -ssize_t counter_signal_enum_available_read(struct counter_device *counter, - struct counter_signal *signal, - void *priv, char *buf); - -ssize_t counter_count_enum_read(struct counter_device *counter, - struct counter_count *count, void *priv, - char *buf); -ssize_t counter_count_enum_write(struct counter_device *counter, - struct 
counter_count *count, void *priv, - const char *buf, size_t len); - -ssize_t counter_count_enum_available_read(struct counter_device *counter, - struct counter_count *count, - void *priv, char *buf); - -ssize_t counter_device_enum_read(struct counter_device *counter, void *priv, - char *buf); -ssize_t counter_device_enum_write(struct counter_device *counter, void *priv, - const char *buf, size_t len); - -ssize_t counter_device_enum_available_read(struct counter_device *counter, - void *priv, char *buf); - -#endif /* _COUNTER_ENUM_H_ */ -- cgit v1.2.3 From 712392f558ef4280b67659fb2d52854dcfc7c8ba Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Fri, 27 Aug 2021 12:47:48 +0900 Subject: counter: Update counter.h comments to reflect sysfs internalization The Counter subsystem architecture and driver implementations have changed in order to handle Counter sysfs interactions in a more consistent way. This patch updates the Generic Counter interface header file comments to reflect the changes. Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/19da8ae0c05381b0967c8a334b67f86b814eb880.1630031207.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/counter.h | 62 ++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index b69277f5c4c5..445f22d8bfe2 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -188,11 +188,10 @@ struct counter_comp { /** * struct counter_signal - Counter Signal node - * @id: unique ID used to identify signal - * @name: device-specific Signal name; ideally, this should match the name - * as it appears in the datasheet documentation - * @ext: optional array of Counter Signal extensions - * @num_ext: number of Counter Signal extensions specified in @ext + * @id: unique ID used to identify the Signal + * @name: device-specific Signal name + * @ext: optional array of Signal extensions + * @num_ext: number of Signal extensions specified in @ext */ struct counter_signal { int id; @@ -206,7 +205,7 @@ struct counter_signal { * struct counter_synapse - Counter Synapse node * @actions_list: array of available action modes * @num_actions: number of action modes specified in @actions_list - * @signal: pointer to associated signal + * @signal: pointer to the associated Signal */ struct counter_synapse { const enum counter_synapse_action *actions_list; @@ -217,15 +216,14 @@ struct counter_synapse { /** * struct counter_count - Counter Count node - * @id: unique ID used to identify Count - * @name: device-specific Count name; ideally, this should match - * the name as it appears in the datasheet documentation - * @functions_list: array available function modes + * @id: unique ID used to identify the Count + * @name: device-specific Count name + * @functions_list: array of available function modes * @num_functions: number of function modes specified in @functions_list - * @synapses: array of synapses for initialization - * @num_synapses: number of synapses specified in @synapses - * @ext: optional array of Counter Count extensions - * @num_ext: number of Counter Count extensions specified in @ext + * @synapses: array of Synapses for initialization + * @num_synapses: number of Synapses specified in @synapses + * @ext: optional array of Count extensions + * @num_ext: number of Count extensions specified in @ext */ struct counter_count { int id; @@ -243,27 +241,27 @@ struct counter_count { 
/** * struct counter_ops - Callbacks from driver - * @signal_read: optional read callback for Signal attribute. The read - * level of the respective Signal should be passed back via - * the level parameter. - * @count_read: optional read callback for Count attribute. The read - * value of the respective Count should be passed back via - * the val parameter. - * @count_write: optional write callback for Count attribute. The write - * value for the respective Count is passed in via the val + * @signal_read: optional read callback for Signals. The read level of + * the respective Signal should be passed back via the + * level parameter. + * @count_read: read callback for Counts. The read value of the + * respective Count should be passed back via the value + * parameter. + * @count_write: optional write callback for Counts. The write value for + * the respective Count is passed in via the value * parameter. * @function_read: read callback for the Count function modes. The read * function mode of the respective Count should be passed * back via the function parameter. - * @function_write: write callback for Count function modes. The function - * mode to write for the respective Count is passed in via - * the function parameter. - * @action_read: read callback for the Synapse action modes. The read action - * mode of the respective Synapse should be passed back via - * the action parameter. - * @action_write: write callback for Synapse action modes. The action mode - * to write for the respective Synapse is passed in via the - * action parameter. + * @function_write: optional write callback for Count function modes. The + * function mode to write for the respective Count is + * passed in via the function parameter. + * @action_read: optional read callback for the Synapse action modes. The + * read action mode of the respective Synapse should be + * passed back via the action parameter. + * @action_write: optional write callback for Synapse action modes. The + * action mode to write for the respective Synapse is + * passed in via the action parameter. */ struct counter_ops { int (*signal_read)(struct counter_device *counter, @@ -291,7 +289,7 @@ struct counter_ops { /** * struct counter_device - Counter data structure - * @name: name of the device as it appears in the datasheet + * @name: name of the device * @parent: optional parent device providing the counters * @ops: callbacks from driver * @signals: array of Signals -- cgit v1.2.3 From e65c26f413718ed2e6d788491adcd8cebc0f44b6 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Wed, 29 Sep 2021 12:15:58 +0900 Subject: counter: Move counter enums to uapi header This is in preparation for a subsequent patch implementing a character device interface for the Counter subsystem.
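Once the enums live in the uapi header, userspace can consume exactly the definitions the kernel uses. A minimal sketch of what this enables (hypothetical userspace snippet; assumes the enums are re-exported through the uapi <linux/counter.h> as this series arranges):

	#include <linux/counter.h>

	/* Values come straight from the shared uapi definitions. */
	enum counter_function func = COUNTER_FUNCTION_QUADRATURE_X4;
	enum counter_synapse_action act = COUNTER_SYNAPSE_ACTION_RISING_EDGE;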
Reviewed-by: David Lechner Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/962a5f2027fafcf4f77c10e1baf520463960d1ee.1632884256.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/counter.h | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 445f22d8bfe2..7c9f7e23953a 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -9,6 +9,7 @@ #include #include #include +#include struct counter_device; struct counter_count; @@ -27,47 +28,6 @@ enum counter_comp_type { COUNTER_COMP_COUNT_MODE, }; -enum counter_scope { - COUNTER_SCOPE_DEVICE, - COUNTER_SCOPE_SIGNAL, - COUNTER_SCOPE_COUNT, -}; - -enum counter_count_direction { - COUNTER_COUNT_DIRECTION_FORWARD, - COUNTER_COUNT_DIRECTION_BACKWARD, -}; - -enum counter_count_mode { - COUNTER_COUNT_MODE_NORMAL, - COUNTER_COUNT_MODE_RANGE_LIMIT, - COUNTER_COUNT_MODE_NON_RECYCLE, - COUNTER_COUNT_MODE_MODULO_N, -}; - -enum counter_function { - COUNTER_FUNCTION_INCREASE, - COUNTER_FUNCTION_DECREASE, - COUNTER_FUNCTION_PULSE_DIRECTION, - COUNTER_FUNCTION_QUADRATURE_X1_A, - COUNTER_FUNCTION_QUADRATURE_X1_B, - COUNTER_FUNCTION_QUADRATURE_X2_A, - COUNTER_FUNCTION_QUADRATURE_X2_B, - COUNTER_FUNCTION_QUADRATURE_X4, -}; - -enum counter_signal_level { - COUNTER_SIGNAL_LEVEL_LOW, - COUNTER_SIGNAL_LEVEL_HIGH, -}; - -enum counter_synapse_action { - COUNTER_SYNAPSE_ACTION_NONE, - COUNTER_SYNAPSE_ACTION_RISING_EDGE, - COUNTER_SYNAPSE_ACTION_FALLING_EDGE, - COUNTER_SYNAPSE_ACTION_BOTH_EDGES, -}; - /** * struct counter_comp - Counter component node * @type: Counter component data type -- cgit v1.2.3 From b6c50affda5957a3629b149a91c7f6688ffce7f7 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Wed, 29 Sep 2021 12:15:59 +0900 Subject: counter: Add character device interface This patch introduces a character device interface for the Counter subsystem. Device data is exposed through standard character device read operations. Device data is gathered when a Counter event is pushed by the respective Counter device driver. Configuration is handled via ioctl operations on the respective Counter character device node. Cc: David Lechner Cc: Gwendal Grignou Cc: Dan Carpenter Cc: Oleksij Rempel Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/b8b8c64b4065aedff43699ad1f0e2f8d1419c15b.1632884256.git.vilhelm.gray@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/counter.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 7c9f7e23953a..22b14a552b1d 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -6,9 +6,14 @@ #ifndef _COUNTER_H_ #define _COUNTER_H_ +#include #include #include +#include +#include +#include #include +#include #include struct counter_device; @@ -199,6 +204,20 @@ struct counter_count { size_t num_ext; }; +/** + * struct counter_event_node - Counter Event node + * @l: list of current watching Counter events + * @event: event that triggers + * @channel: event channel + * @comp_list: list of components to watch when event triggers + */ +struct counter_event_node { + struct list_head l; + u8 event; + u8 channel; + struct list_head comp_list; +}; + /** * struct counter_ops - Callbacks from driver * @signal_read: optional read callback for Signals. 
The read level of @@ -222,6 +241,13 @@ struct counter_count { * @action_write: optional write callback for Synapse action modes. The * action mode to write for the respective Synapse is * passed in via the action parameter. + * @events_configure: optional write callback to configure events. The list of + * struct counter_event_node may be accessed via the + * events_list member of the counter parameter. + * @watch_validate: optional callback to validate a watch. The Counter + * component watch configuration is passed in via the watch + * parameter. A return value of 0 indicates a valid Counter + * component watch configuration. */ struct counter_ops { int (*signal_read)(struct counter_device *counter, @@ -245,6 +271,9 @@ struct counter_ops { struct counter_count *count, struct counter_synapse *synapse, enum counter_synapse_action action); + int (*events_configure)(struct counter_device *counter); + int (*watch_validate)(struct counter_device *counter, + const struct counter_watch *watch); }; /** @@ -260,6 +289,16 @@ struct counter_ops { * @num_ext: number of Counter device extensions specified in @ext * @priv: optional private data supplied by driver * @dev: internal device structure + * @chrdev: internal character device structure + * @events_list: list of current watching Counter events + * @events_list_lock: lock to protect Counter events list operations + * @next_events_list: list of next watching Counter events + * @n_events_list_lock: lock to protect Counter next events list operations + * @events: queue of detected Counter events + * @events_wait: wait queue to allow blocking reads of Counter events + * @events_lock: lock to protect Counter events queue read operations + * @chrdev_lock: lock to limit chrdev to a single open at a time + * @ops_exist_lock: lock to prevent use during removal */ struct counter_device { const char *name; @@ -278,12 +317,29 @@ struct counter_device { void *priv; struct device dev; + struct cdev chrdev; + struct list_head events_list; + spinlock_t events_list_lock; + struct list_head next_events_list; + struct mutex n_events_list_lock; + DECLARE_KFIFO_PTR(events, struct counter_event); + wait_queue_head_t events_wait; + struct mutex events_lock; + /* + * chrdev_lock is locked by counter_chrdev_open() and unlocked by + * counter_chrdev_release(), so a mutex is not possible here because + * chrdev_lock will invariably be held when returning to user space + */ + atomic_t chrdev_lock; + struct mutex ops_exist_lock; }; int counter_register(struct counter_device *const counter); void counter_unregister(struct counter_device *const counter); int devm_counter_register(struct device *dev, struct counter_device *const counter); +void counter_push_event(struct counter_device *const counter, const u8 event, + const u8 channel); #define COUNTER_COMP_DEVICE_U8(_name, _read, _write) \ { \ -- cgit v1.2.3 From 914ff7719e8adaf87131dd7cd0a3520afcf89516 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Thu, 14 Oct 2021 13:38:32 +0800 Subject: iommu/vt-d: Dump DMAR translation structure when DMA fault occurs When the dmar translation fault happens, the kernel prints a single-line fault reason with the corresponding hexadecimal code defined in the Intel VT-d specification. Currently, when a user wants to debug the translation fault in detail, debugfs is used for dumping the dmar_translation_struct, which is not available when the kernel fails to boot. Dump the DMAR translation structure, pagewalk the IO page table and print the page table entry when the fault happens.
This takes effect only when CONFIG_DMAR_DEBUG is enabled. Signed-off-by: Kyung Min Park Link: https://lore.kernel.org/r/20210815203845.31287-1-kyung.min.park@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/dmar.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e04436a7ff27..45e903d84733 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -131,6 +131,14 @@ static inline int dmar_res_noop(struct acpi_dmar_header *hdr, void *arg) return 0; } +#ifdef CONFIG_DMAR_DEBUG +void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid); +#else +static inline void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, + unsigned long long addr, u32 pasid) {} +#endif + #ifdef CONFIG_INTEL_IOMMU extern int iommu_detected, no_iommu; extern int intel_iommu_init(void); -- cgit v1.2.3 From b34380a6d767c54480a937951e6189a7f9699443 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 14 Oct 2021 13:38:33 +0800 Subject: iommu/vt-d: Remove duplicate identity domain flag The iommu_domain data structure already has the "type" field to keep the type of a domain. It's unnecessary to have the DOMAIN_FLAG_STATIC_IDENTITY flag in the vt-d implementation. This cleans it up with no functionality change. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210926114535.923263-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20211014053839.727419-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 05a65eb155f7..65b15af3cf1a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -517,9 +517,6 @@ struct context_entry { u64 hi; }; -/* si_domain contains mulitple devices */ -#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0) - /* * When VT-d works in the scalable mode, it allows DMA translation to * happen through either first level or second level page table. This -- cgit v1.2.3 From 37c8041a818dec2cea22ea48a3f90d512a9e1fcd Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 14 Oct 2021 13:38:38 +0800 Subject: iommu/vt-d: Convert the return type of first_pte_in_page to bool The first_pte_in_page() returns true or false, so let's convert its return type to bool. In addition, use 'IS_ALIGNED' to make the code more readable and neater. 
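For reference, a sketch of why the two forms test the same thing (IS_ALIGNED() here is the generic kernel alignment macro, and VTD_PAGE_SIZE is assumed to be a power of two, as a page size is):

	/* IS_ALIGNED(x, a) expands to (((x) & ((typeof(x))(a) - 1)) == 0), so: */
	IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE)
		== !((unsigned long)pte & (VTD_PAGE_SIZE - 1))
		== !((unsigned long)pte & ~VTD_PAGE_MASK)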
Signed-off-by: Longpeng(Mike) Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 65b15af3cf1a..52481625838c 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -705,9 +705,9 @@ static inline bool dma_pte_superpage(struct dma_pte *pte) return (pte->val & DMA_PTE_LARGE_PAGE); } -static inline int first_pte_in_page(struct dma_pte *pte) +static inline bool first_pte_in_page(struct dma_pte *pte) { - return !((unsigned long)pte & ~VTD_PAGE_MASK); + return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); } extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); -- cgit v1.2.3 From 9906b9352a35427508d974db3a496eb3c49e2010 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Thu, 14 Oct 2021 13:38:39 +0800 Subject: iommu/vt-d: Avoid duplicate removing in __domain_mapping() __domain_mapping() always removes the pages in the range from 'iov_pfn' to 'end_pfn', but 'end_pfn' is always the last pfn of the range that the caller wants to map. This introduces many duplicate removals and makes the map operation take too long, for example: Map iova=0x100000,nr_pages=0x7d61800 iov_pfn: 0x100000, end_pfn: 0x7e617ff iov_pfn: 0x140000, end_pfn: 0x7e617ff iov_pfn: 0x180000, end_pfn: 0x7e617ff iov_pfn: 0x1c0000, end_pfn: 0x7e617ff iov_pfn: 0x200000, end_pfn: 0x7e617ff ... it takes about 50ms in total. We can reduce the cost by recalculating 'end_pfn' and limiting it to the boundary of the end of this pte page. Map iova=0x100000,nr_pages=0x7d61800 iov_pfn: 0x100000, end_pfn: 0x13ffff iov_pfn: 0x140000, end_pfn: 0x17ffff iov_pfn: 0x180000, end_pfn: 0x1bffff iov_pfn: 0x1c0000, end_pfn: 0x1fffff iov_pfn: 0x200000, end_pfn: 0x23ffff ... it only needs 9ms now. This also removes a meaningless BUG_ON() in __domain_mapping(). Signed-off-by: Longpeng(Mike) Tested-by: Liujunjie Link: https://lore.kernel.org/r/20211008000433.1115-1-longpeng2@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211014053839.727419-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 52481625838c..69230fd695ea 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -710,6 +710,12 @@ static inline bool first_pte_in_page(struct dma_pte *pte) return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); } +static inline int nr_pte_to_next_page(struct dma_pte *pte) +{ + return first_pte_in_page(pte) ? BIT_ULL(VTD_STRIDE_SHIFT) : + (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; +} + extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev); extern int dmar_find_matched_atsr_unit(struct pci_dev *dev); -- cgit v1.2.3 From 53c36de0701f2fdda6b1d209dc89e2bad4a9c7b8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 30 Dec 2020 10:21:39 -0500 Subject: mm: Add kmap_local_folio() This allows us to map a portion of a folio. Callers can only expect to access up to the next page boundary.
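A minimal usage sketch (hypothetical caller; assumes folio, offset, len and buffer are in scope, with offset + len not crossing a page boundary):

	char *addr;

	addr = kmap_local_folio(folio, offset);
	memcpy(buffer, addr, len);	/* must stay within the mapped page */
	kunmap_local(addr);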
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Vlastimil Babka --- include/linux/highmem-internal.h | 11 +++++++++++ include/linux/highmem.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 4aa1031d3e4c..0a0b2b09b1b8 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -73,6 +73,12 @@ static inline void *kmap_local_page(struct page *page) return __kmap_local_page_prot(page, kmap_prot); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + struct page *page = folio_page(folio, offset / PAGE_SIZE); + return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); @@ -171,6 +177,11 @@ static inline void *kmap_local_page(struct page *page) return page_address(page); } +static inline void *kmap_local_folio(struct folio *folio, size_t offset) +{ + return page_address(&folio->page) + offset; +} + static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..27cdd715c5f9 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -96,6 +96,43 @@ static inline void kmap_flush_unused(void); */ static inline void *kmap_local_page(struct page *page); +/** + * kmap_local_folio - Map a page in this folio for temporary usage + * @folio: The folio containing the page. + * @offset: The byte offset within the folio which identifies the page. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation:: + * + * addr1 = kmap_local_folio(folio1, offset1); + * addr2 = kmap_local_folio(folio2, offset2); + * ... + * kunmap_local(addr2); + * kunmap_local(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() for the highmem case, it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_folio() can rely on this side effect. + * + * Context: Can be invoked from any context. + * Return: The virtual address of @offset. + */ +static inline void *kmap_local_folio(struct folio *folio, size_t offset); + /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped -- cgit v1.2.3 From b424de33c42dbd03e39ca8f521126d39acb6c335 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 24 Apr 2021 14:50:36 -0400 Subject: mm: Add arch_make_folio_accessible() As a default implementation, call arch_make_page_accessible n times. If an architecture can do better, it can override this.
Also move the default implementation of arch_make_page_accessible() from gfp.h to mm.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/gfp.h | 6 ------ include/linux/mm.h | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..dc5ff40608ce 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -520,12 +520,6 @@ static inline void arch_free_page(struct page *page, int order) { } #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); diff --git a/include/linux/mm.h b/include/linux/mm.h index 47143f3e7f0a..c28cace6fe6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1734,6 +1734,29 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +static inline int arch_make_page_accessible(struct page *page) +{ + return 0; +} +#endif + +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) +{ + int ret; + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + ret = arch_make_page_accessible(folio_page(folio, i)); + if (ret) + break; + } + + return ret; +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ -- cgit v1.2.3 From 35a020ba0802f732ba070e03e50b140aea4373c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Apr 2021 09:00:12 -0400 Subject: mm: Add folio_young and folio_idle Idle page tracking is handled through page_ext on 32-bit architectures. Add folio equivalents for 32-bit and move all the page compatibility parts to common code. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Reviewed-by: David Howells --- include/linux/page_idle.h | 99 +++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index d8a6aecf99cb..83abf95e9fa7 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -8,46 +8,16 @@ #ifdef CONFIG_PAGE_IDLE_FLAG -#ifdef CONFIG_64BIT -static inline bool page_is_young(struct page *page) -{ - return PageYoung(page); -} - -static inline void set_page_young(struct page *page) -{ - SetPageYoung(page); -} - -static inline bool test_and_clear_page_young(struct page *page) -{ - return TestClearPageYoung(page); -} - -static inline bool page_is_idle(struct page *page) -{ - return PageIdle(page); -} - -static inline void set_page_idle(struct page *page) -{ - SetPageIdle(page); -} - -static inline void clear_page_idle(struct page *page) -{ - ClearPageIdle(page); -} -#else /* !CONFIG_64BIT */ +#ifndef CONFIG_64BIT /* * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. 
*/ extern struct page_ext_operations page_idle_ops; -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -55,9 +25,9 @@ static inline bool page_is_young(struct page *page) return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -65,9 +35,9 @@ static inline void set_page_young(struct page *page) set_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -75,9 +45,9 @@ static inline bool test_and_clear_page_young(struct page *page) return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return false; @@ -85,9 +55,9 @@ static inline bool page_is_idle(struct page *page) return test_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; @@ -95,46 +65,75 @@ static inline void set_page_idle(struct page *page) set_bit(PAGE_EXT_IDLE, &page_ext->flags); } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); } -#endif /* CONFIG_64BIT */ +#endif /* !CONFIG_64BIT */ #else /* !CONFIG_PAGE_IDLE_FLAG */ -static inline bool page_is_young(struct page *page) +static inline bool folio_test_young(struct folio *folio) { return false; } -static inline void set_page_young(struct page *page) +static inline void folio_set_young(struct folio *folio) { } -static inline bool test_and_clear_page_young(struct page *page) +static inline bool folio_test_clear_young(struct folio *folio) { return false; } -static inline bool page_is_idle(struct page *page) +static inline bool folio_test_idle(struct folio *folio) { return false; } -static inline void set_page_idle(struct page *page) +static inline void folio_set_idle(struct folio *folio) { } -static inline void clear_page_idle(struct page *page) +static inline void folio_clear_idle(struct folio *folio) { } #endif /* CONFIG_PAGE_IDLE_FLAG */ +static inline bool page_is_young(struct page *page) +{ + return folio_test_young(page_folio(page)); +} + +static inline void set_page_young(struct page *page) +{ + folio_set_young(page_folio(page)); +} + +static inline bool test_and_clear_page_young(struct page *page) +{ + return folio_test_clear_young(page_folio(page)); +} + +static inline bool page_is_idle(struct 
page *page) +{ + return folio_test_idle(page_folio(page)); +} + +static inline void set_page_idle(struct page *page) +{ + folio_set_idle(page_folio(page)); +} + +static inline void clear_page_idle(struct page *page) +{ + folio_clear_idle(page_folio(page)); +} #endif /* _LINUX_MM_PAGE_IDLE_H */ -- cgit v1.2.3 From 76580b6529db622964b4ffee59ded25efcb73748 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Apr 2021 10:47:39 -0400 Subject: mm/swap: Add folio_mark_accessed() Convert mark_page_accessed() to folio_mark_accessed(). It already operated on the entire compound page, but now we can avoid calling compound_head quite so many times. Shrinks the function from 424 bytes to 295 bytes (shrinking by 129 bytes). The compatibility wrapper is 30 bytes and the exported symbol adds 8 bytes, so the kernel shrinks by 91 bytes overall. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0fc84797623f..067b7af5242c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -352,7 +352,8 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); -extern void mark_page_accessed(struct page *); +void mark_page_accessed(struct page *); +void folio_mark_accessed(struct folio *); extern atomic_t lru_disable_count; -- cgit v1.2.3 From d9c08e2232fbca4fa56b9350f7f84835c4aa3add Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 28 Feb 2021 16:02:57 -0500 Subject: mm/rmap: Add folio_mkclean() Transform page_mkclean() into folio_mkclean() and add a page_mkclean() wrapper around folio_mkclean(). folio_mkclean is 15 bytes smaller than page_mkclean, but the kernel is enlarged by 33 bytes due to inlining page_folio() into each caller. This will go away once the callers are converted to use folio_mkclean(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/rmap.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c976cc6de257..e704b1a4c06c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); * * returns the number of cleaned PTEs. */ -int page_mkclean(struct page *); +int folio_mkclean(struct folio *); /* * called in munlock()/munmap() path to check for other vmas holding @@ -295,12 +295,14 @@ static inline void try_to_unmap(struct page *page, enum ttu_flags flags) { } -static inline int page_mkclean(struct page *page) +static inline int folio_mkclean(struct folio *folio) { return 0; } - - #endif /* CONFIG_MMU */ +static inline int page_mkclean(struct page *page) +{ + return folio_mkclean(page_folio(page)); +} #endif /* _LINUX_RMAP_H */ -- cgit v1.2.3 From 3417013e0d183be9b42d794082eec0ec1c5b5f15 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 May 2021 07:28:40 -0400 Subject: mm/migrate: Add folio_migrate_mapping() Reimplement migrate_page_move_mapping() as a wrapper around folio_migrate_mapping(). Saves 193 bytes of kernel text.
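The page interface survives as a thin shim over the folio implementation, the same pattern used throughout this series. A sketch of what the compatibility wrapper reduces to (assuming the signatures shown in the diff below):

	int migrate_page_move_mapping(struct address_space *mapping,
			struct page *newpage, struct page *page, int extra_count)
	{
		return folio_migrate_mapping(mapping, page_folio(newpage),
				page_folio(page), extra_count);
	}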
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index c8077e936691..58b3af645e20 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -57,6 +57,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +int folio_migrate_mapping(struct address_space *mapping, + struct folio *newfolio, struct folio *folio, int extra_count); #else static inline void putback_movable_pages(struct list_head *l) {} -- cgit v1.2.3 From 19138349ed59b90ce58aca319b873eca2e04ad43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 May 2021 15:26:29 -0400 Subject: mm/migrate: Add folio_migrate_flags() Turn migrate_page_states() into a wrapper around folio_migrate_flags(). Also convert two functions only called from folio_migrate_flags() to be folio-based. ksm_migrate_page() becomes folio_migrate_ksm() and copy_page_owner() becomes folio_copy_owner(). folio_migrate_flags() alone shrinks by two thirds -- 1967 bytes down to 642 bytes. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/ksm.h | 4 ++-- include/linux/migrate.h | 1 + include/linux/page_owner.h | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..a38a5bca1ba5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -52,7 +52,7 @@ struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); -void ksm_migrate_page(struct page *newpage, struct page *oldpage); +void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); #else /* !CONFIG_KSM */ @@ -83,7 +83,7 @@ static inline void rmap_walk_ksm(struct page *page, { } -static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) +static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 58b3af645e20..aa875b83f7ba 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -57,6 +57,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); #else diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 719bfe5108c5..43c638c51c1f 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -12,7 +12,7 @@ extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); extern void __split_page_owner(struct page *page, unsigned int nr); -extern void __copy_page_owner(struct page *oldpage, struct page *newpage); 
+extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -36,10 +36,10 @@ static inline void split_page_owner(struct page *page, unsigned int nr) if (static_branch_unlikely(&page_owner_inited)) __split_page_owner(page, nr); } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *old) { if (static_branch_unlikely(&page_owner_inited)) - __copy_page_owner(oldpage, newpage); + __folio_copy_owner(newfolio, old); } static inline void set_page_owner_migrate_reason(struct page *page, int reason) { @@ -63,7 +63,7 @@ static inline void split_page_owner(struct page *page, unsigned int order) { } -static inline void copy_page_owner(struct page *oldpage, struct page *newpage) +static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) { } static inline void set_page_owner_migrate_reason(struct page *page, int reason) -- cgit v1.2.3 From 715cbfd6c5c595bc8b7a6f9ad1fe9fec0122bb20 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 May 2021 15:05:06 -0400 Subject: mm/migrate: Add folio_migrate_copy() This is the folio equivalent of migrate_page_copy(), which is retained as a wrapper for filesystems which are not yet converted to folios. Also convert copy_huge_page() to folio_copy(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: Vlastimil Babka --- include/linux/migrate.h | 1 + include/linux/mm.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index aa875b83f7ba..0d2aeb9b0f66 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -58,6 +58,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); +void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index c28cace6fe6b..93d5fbe2e4e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -912,7 +912,7 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); -void copy_huge_page(struct page *dst, struct page *src); +void folio_copy(struct folio *dst, struct folio *src); /* * Compound pages have a destructor function. Provide a -- cgit v1.2.3 From bd3488e7b4d61780eb3dfaca1cc6f4026bcffd48 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 19 Mar 2021 08:39:50 -0400 Subject: mm/writeback: Rename __add_wb_stat() to wb_stat_mod() Make this look like the newly renamed vmstat functions. 
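Beyond the cosmetic rename, the three-argument form lets callers batch a whole delta in one call instead of looping over inc_wb_stat()/dec_wb_stat(). A hedged sketch of such a batched update (nr is a hypothetical page count; WB_RECLAIMABLE is an existing wb_stat_item):

	/* account nr pages in one step rather than nr separate calls */
	wb_stat_mod(wb, WB_RECLAIMABLE, -nr);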
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/backing-dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ac7f231b8825..a62e72dd829f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -64,7 +64,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) return atomic_long_read(&bdi->tot_write_bandwidth); } -static inline void __add_wb_stat(struct bdi_writeback *wb, +static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); @@ -72,12 +72,12 @@ static inline void __add_wb_stat(struct bdi_writeback *wb, static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, 1); + wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - __add_wb_stat(wb, item, -1); + wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -- cgit v1.2.3 From be5f1797523004d0d9aaee81a523d86e3b890007 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 20 Mar 2021 16:34:54 -0400 Subject: flex_proportions: Allow N events instead of 1 When batching events (such as writing back N pages in a single I/O), it is better to do one flex_proportion operation instead of N. There is only one caller of __fprop_inc_percpu_max(), and it's the one we're going to change in the next patch, so rename it instead of adding a compatibility wrapper. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- include/linux/flex_proportions.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index c12df59d3f5f..3e378b1fb0bc 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -83,9 +83,10 @@ struct fprop_local_percpu { int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); -void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); -void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, - int max_frac); +void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, + long nr); +void __fprop_add_percpu_max(struct fprop_global *p, + struct fprop_local_percpu *pl, int max_frac, long nr); void fprop_fraction_percpu(struct fprop_global *p, struct fprop_local_percpu *pl, unsigned long *numerator, unsigned long *denominator); @@ -96,7 +97,7 @@ void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) unsigned long flags; local_irq_save(flags); - __fprop_inc_percpu(p, pl); + __fprop_add_percpu(p, pl, 1); local_irq_restore(flags); } -- cgit v1.2.3 From 269ccca3899f6bce49e004f50f623e0b161fb027 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Jan 2021 23:34:16 -0500 Subject: mm/writeback: Add __folio_end_writeback() test_clear_page_writeback() is actually an mm-internal function, although it's named as if it's a pagecache function. Move it to mm/internal.h, rename it to __folio_end_writeback() and change the return type to bool. 
The conversion from page to folio is mostly about accounting the number of pages being written back, although it does eliminate a couple of calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/page-flags.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2491e84b8e52..3aebb2060a26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -657,7 +657,6 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); #define test_set_page_writeback(page) \ -- cgit v1.2.3 From f143f1ea5a5380b2682e6836fcd24338a2aa82c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 24 Apr 2021 12:00:48 -0400 Subject: mm/writeback: Add folio_start_writeback() Rename set_page_writeback() to folio_start_writeback() to match folio_end_writeback(). Do not bother with wrappers that return void; callers are perfectly capable of ignoring return values. Add wrappers for set_page_writeback(), set_page_writeback_keepwrite() and test_set_page_writeback() for compatibility with existing filesystems. The main advantage of this patch is getting the statistics right, although it does eliminate a couple of calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/page-flags.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3aebb2060a26..a68af80649a4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -657,21 +657,22 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -int __test_set_page_writeback(struct page *page, bool keep_write); +bool __folio_start_writeback(struct folio *folio, bool keep_write); +bool set_page_writeback(struct page *page); -#define test_set_page_writeback(page) \ - __test_set_page_writeback(page, false) -#define test_set_page_writeback_keepwrite(page) \ - __test_set_page_writeback(page, true) +#define folio_start_writeback(folio) \ + __folio_start_writeback(folio, false) +#define folio_start_writeback_keepwrite(folio) \ + __folio_start_writeback(folio, true) -static inline void set_page_writeback(struct page *page) +static inline void set_page_writeback_keepwrite(struct page *page) { - test_set_page_writeback(page); + folio_start_writeback_keepwrite(page_folio(page)); } -static inline void set_page_writeback_keepwrite(struct page *page) +static inline bool test_set_page_writeback(struct page *page) { - test_set_page_writeback_keepwrite(page); + return set_page_writeback(page); } __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) -- cgit v1.2.3 From b5e84594cafb934e023ee7d4ea4208b6c6b65fcc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 26 Apr 2021 23:53:10 -0400 Subject: mm/writeback: Add folio_mark_dirty() Reimplement set_page_dirty() as a wrapper around folio_mark_dirty(). There is no change to filesystems as they were already being called with the compound_head of the page being marked dirty.
We avoid several calls to compound_head(), both statically (through using folio_test_dirty() instead of PageDirty()) and dynamically (by calling folio_mapping() instead of page_mapping()). Also return bool instead of int to show the range of values actually returned, and add kernel-doc. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 93d5fbe2e4e3..00510b02ffe1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2008,7 +2008,8 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); -int set_page_dirty(struct page *page); +bool folio_mark_dirty(struct folio *folio); +bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void __cancel_dirty_page(struct page *page); static inline void cancel_dirty_page(struct page *page) -- cgit v1.2.3 From 203a31516616111b8eaaf00c16fa3fcaa25e6f81 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 11:01:10 -0400 Subject: mm/writeback: Add __folio_mark_dirty() Turn __set_page_dirty() into a wrapper around __folio_mark_dirty(). Convert account_page_dirtied() into folio_account_dirtied() and account the number of pages in the folio to support multi-page folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/memcontrol.h | 5 ++--- include/linux/pagemap.h | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bd78c13d1fa..e34bf0cbdf55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1615,10 +1615,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { - struct folio *folio = page_folio(page); if (mem_cgroup_disabled()) return; @@ -1643,7 +1642,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, { } -static inline void mem_cgroup_track_foreign_dirty(struct page *page, +static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bdbd7be67812..aa9a083083df 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -772,8 +772,13 @@ void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); void wait_for_stable_page(struct page *page); void folio_wait_stable(struct folio *folio); +void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); +static inline void __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) +{ + __folio_mark_dirty(page_folio(page), mapping, warn); +} -void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); -- cgit v1.2.3 From 85d4d2ebc86f02740c5f5f72ec43cc47d3560720 Mon Sep 17 00:00:00 2001 From: "Matthew
Wilcox (Oracle)" Date: Mon, 3 May 2021 23:30:44 -0400 Subject: mm/writeback: Add filemap_dirty_folio() Reimplement __set_page_dirty_nobuffers() as a wrapper around filemap_dirty_folio(). Eventually folio_mark_dirty() will pass the folio's mapping to the address space's ->dirty_folio() operation, so add the parameter to filemap_dirty_folio() now. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d1f65adf6a26..d4f24eba066f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -393,6 +393,7 @@ void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); void account_page_redirty(struct page *page); void sb_mark_inode_writeback(struct inode *inode); -- cgit v1.2.3 From fc9b6a538b222eaeb4d1e3f93e716b6626d9b653 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 16:12:09 -0400 Subject: mm/writeback: Add folio_account_cleaned() Get the statistics right; compound pages were being accounted as a single page. This didn't matter before now as no filesystem which supported compound pages did writeback. Also move the declaration to pagemap.h since this is part of the page cache. Add a wrapper for account_page_cleaned(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 3 --- include/linux/pagemap.h | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00510b02ffe1..f47af4ca4873 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -39,7 +39,6 @@ struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; -struct bdi_writeback; struct pt_regs; extern int sysctl_page_lock_unfairness; @@ -2006,8 +2005,6 @@ extern void do_invalidatepage(struct page *page, unsigned int offset, int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_cleaned(struct page *page, struct address_space *mapping, - struct bdi_writeback *wb); bool folio_mark_dirty(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index aa9a083083df..8ba7da513ac4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -778,6 +778,13 @@ static inline void __set_page_dirty(struct page *page, { __folio_mark_dirty(page_folio(page), mapping, warn); } +void folio_account_cleaned(struct folio *folio, struct address_space *mapping, + struct bdi_writeback *wb); +static inline void account_page_cleaned(struct page *page, + struct address_space *mapping, struct bdi_writeback *wb) +{ + return folio_account_cleaned(page_folio(page), mapping, wb); +} int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); -- cgit v1.2.3 From fdaf532a23795e320c47e1caab50a53c202135c5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Mar 2021 16:43:04 -0500 Subject: mm/writeback: Add folio_cancel_dirty() Turn __cancel_dirty_page() into __folio_cancel_dirty() and add wrappers. 
Move the prototypes into pagemap.h since this is page cache functionality. Saves 44 bytes of kernel text in total; 33 bytes from __folio_cancel_dirty and 11 from two callers of cancel_dirty_page(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 7 ------- include/linux/pagemap.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f47af4ca4873..3bc394aafbc0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2008,13 +2008,6 @@ int redirty_page_for_writepage(struct writeback_control *wbc, bool folio_mark_dirty(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void __cancel_dirty_page(struct page *page); -static inline void cancel_dirty_page(struct page *page) -{ - /* Avoid atomic ops, locking, etc. when not actually needed. */ - if (PageDirty(page)) - __cancel_dirty_page(page); -} int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8ba7da513ac4..8f4a9ba0bfb0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -785,6 +785,17 @@ static inline void account_page_cleaned(struct page *page, { return folio_account_cleaned(page_folio(page), mapping, wb); } +void __folio_cancel_dirty(struct folio *folio); +static inline void folio_cancel_dirty(struct folio *folio) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (folio_test_dirty(folio)) + __folio_cancel_dirty(folio); +} +static inline void cancel_dirty_page(struct page *page) +{ + folio_cancel_dirty(page_folio(page)); +} int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); -- cgit v1.2.3 From 9350f20a070d27a6c6a292a2b6f6091e7ea90a65 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 28 Feb 2021 16:21:20 -0500 Subject: mm/writeback: Add folio_clear_dirty_for_io() Transform clear_page_dirty_for_io() into folio_clear_dirty_for_io() and add a compatibility wrapper. Also move the declaration to pagemap.h as this is page cache functionality that doesn't need to be used by the rest of the kernel. Increases the size of the kernel by 79 bytes. While we remove a few calls to compound_head(), we add a call to folio_nr_pages() to get the stats correct for the eventual support of multi-page folios. 
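For reference, the page-based name survives as the compatibility wrapper mentioned above; a minimal sketch of what it reduces to (the actual definition lives in the mm code, not in this header):

	bool clear_page_dirty_for_io(struct page *page)
	{
		return folio_clear_dirty_for_io(page_folio(page));
	}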
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 1 - include/linux/pagemap.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bc394aafbc0..dd5c6970ba67 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2008,7 +2008,6 @@ int redirty_page_for_writepage(struct writeback_control *wbc, bool folio_mark_dirty(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8f4a9ba0bfb0..0dda6459d2d1 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -796,6 +796,8 @@ static inline void cancel_dirty_page(struct page *page) { folio_cancel_dirty(page_folio(page)); } +bool folio_clear_dirty_for_io(struct folio *folio); +bool clear_page_dirty_for_io(struct page *page); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); -- cgit v1.2.3 From 25ff8b15537dfa0e1a62d55cfcc48f3c8bd8a76c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 3 May 2021 10:06:55 -0400 Subject: mm/writeback: Add folio_account_redirty() Account the number of pages in the folio that we're redirtying. Turn account_page_redirty() into a wrapper around it. Also turn the comment on folio_account_redirty() into kernel-doc and edit it slightly so it makes sense to its potential callers. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/writeback.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d4f24eba066f..852100bea351 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -394,7 +394,11 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); -void account_page_redirty(struct page *page); +void folio_account_redirty(struct folio *folio); +static inline void account_page_redirty(struct page *page) +{ + folio_account_redirty(page_folio(page)); +} void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); -- cgit v1.2.3 From cd78ab11a8810dd297f4751d17cc53e3dce36024 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 2 May 2021 23:22:52 -0400 Subject: mm/writeback: Add folio_redirty_for_writepage() Reimplement redirty_page_for_writepage() as a wrapper around folio_redirty_for_writepage(). Account the number of pages in the folio, add kernel-doc and move the prototype to writeback.h.
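As an illustration only (the myfs_* names and the backpressure check are made up), a ->writepage() implementation that cannot make progress would use the new helper like this:

	static int myfs_writepage(struct page *page, struct writeback_control *wbc)
	{
		struct folio *folio = page_folio(page);

		if (myfs_backing_congested()) {		/* hypothetical condition */
			folio_redirty_for_writepage(wbc, folio);
			folio_unlock(folio);
			return 0;
		}
		/* normal writeback path elided */
		return 0;
	}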
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 4 ---- include/linux/writeback.h | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5c6970ba67..23ab040aa65a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -36,9 +36,7 @@ struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; struct pt_regs; extern int sysctl_page_lock_unfairness; @@ -2003,8 +2001,6 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); bool folio_mark_dirty(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 852100bea351..3571f383dfb6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -399,6 +399,8 @@ static inline void account_page_redirty(struct page *page) { folio_account_redirty(page_folio(page)); } +bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); +bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); -- cgit v1.2.3 From 9eb7c76dd31a53331bec795faaf7daa7ffb80071 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Apr 2021 23:11:28 -0400 Subject: mm/filemap: Add i_blocks_per_folio() Reimplement i_blocks_per_page() as a wrapper around i_blocks_per_folio(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0dda6459d2d1..5431d9920dc0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1149,19 +1149,25 @@ static inline int page_mkwrite_check_truncate(struct page *page, } /** - * i_blocks_per_page - How many blocks fit in this page. + * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. - * @page: The page (head page if the page is a THP). + * @folio: The folio. * - * If the block size is larger than the size of this page, return zero. + * If the block size is larger than the size of this folio, return zero. * - * Context: The caller should hold a refcount on the page to prevent it + * Context: The caller should hold a refcount on the folio to prevent it * from being split. - * Return: The number of filesystem blocks covered by this page. + * Return: The number of filesystem blocks covered by this folio. 
*/ +static inline +unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +{ + return folio_size(folio) >> inode->i_blkbits; +} + static inline unsigned int i_blocks_per_page(struct inode *inode, struct page *page) { - return thp_size(page) >> inode->i_blkbits; + return i_blocks_per_folio(inode, page_folio(page)); } #endif /* _LINUX_PAGEMAP_H */ -- cgit v1.2.3 From f705bf84eab2aa3b9842974b8443587727ccb23a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Apr 2021 22:30:06 -0400 Subject: mm/filemap: Add folio_mkwrite_check_truncate() This is the folio equivalent of page_mkwrite_check_truncate(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells --- include/linux/pagemap.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5431d9920dc0..4b74d83c5571 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1120,6 +1120,34 @@ static inline unsigned long dir_pages(struct inode *inode) PAGE_SHIFT; } +/** + * folio_mkwrite_check_truncate - check if folio was truncated + * @folio: the folio to check + * @inode: the inode to check the folio against + * + * Return: the number of bytes in the folio up to EOF, + * or -EFAULT if the folio was truncated. + */ +static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, + struct inode *inode) +{ + loff_t size = i_size_read(inode); + pgoff_t index = size >> PAGE_SHIFT; + size_t offset = offset_in_folio(folio, size); + + if (!folio->mapping) + return -EFAULT; + + /* folio is wholly inside EOF */ + if (folio_next_index(folio) - 1 < index) + return folio_size(folio); + /* folio is wholly past EOF */ + if (folio->index > index || !offset) + return -EFAULT; + /* folio is partially inside EOF */ + return offset; +} + /** * page_mkwrite_check_truncate - check if page was truncated * @page: the page to check -- cgit v1.2.3 From 9bf70167e3c61473b95f40771decc3778bf0fb9f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Apr 2021 16:37:09 -0400 Subject: mm/filemap: Add readahead_folio() The pointers stored in the page cache are folios, by definition. This change comes with a behaviour change -- callers of readahead_folio() are no longer required to put the page reference themselves. This matches how readpage works, rather than matching how readpages used to work. 
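A sketch of a converted ->readahead() loop, to show the new reference-count rule (myfs_start_read() is a made-up helper that unlocks the folio once its I/O completes):

	static void myfs_readahead(struct readahead_control *ractl)
	{
		struct folio *folio;

		while ((folio = readahead_folio(ractl))) {
			myfs_start_read(folio);
			/* no folio_put() here: readahead_folio() already dropped it */
		}
	}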
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 54 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4b74d83c5571..fea9615bab35 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -987,33 +987,57 @@ void page_cache_async_readahead(struct address_space *mapping, page_cache_async_ra(&ractl, page, req_count); } +static inline struct folio *__readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio; + + BUG_ON(ractl->_batch_count > ractl->_nr_pages); + ractl->_nr_pages -= ractl->_batch_count; + ractl->_index += ractl->_batch_count; + + if (!ractl->_nr_pages) { + ractl->_batch_count = 0; + return NULL; + } + + folio = xa_load(&ractl->mapping->i_pages, ractl->_index); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + ractl->_batch_count = folio_nr_pages(folio); + + return folio; +} + /** * readahead_page - Get the next page to read. - * @rac: The current readahead request. + * @ractl: The current readahead request. * * Context: The page is locked and has an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: A pointer to the next page, or %NULL if we are done. */ -static inline struct page *readahead_page(struct readahead_control *rac) +static inline struct page *readahead_page(struct readahead_control *ractl) { - struct page *page; + struct folio *folio = __readahead_folio(ractl); - BUG_ON(rac->_batch_count > rac->_nr_pages); - rac->_nr_pages -= rac->_batch_count; - rac->_index += rac->_batch_count; - - if (!rac->_nr_pages) { - rac->_batch_count = 0; - return NULL; - } + return &folio->page; +} - page = xa_load(&rac->mapping->i_pages, rac->_index); - VM_BUG_ON_PAGE(!PageLocked(page), page); - rac->_batch_count = thp_nr_pages(page); +/** + * readahead_folio - Get the next folio to read. + * @ractl: The current readahead request. + * + * Context: The folio is locked. The caller should unlock the folio once + * all I/O to that folio has completed. + * Return: A pointer to the next folio, or %NULL if we are done. + */ +static inline struct folio *readahead_folio(struct readahead_control *ractl) +{ + struct folio *folio = __readahead_folio(ractl); - return page; + if (folio) + folio_put(folio); + return folio; } static inline unsigned int __readahead_batch(struct readahead_control *rac, -- cgit v1.2.3 From 0995d7e568141226f10f8216aa4965e06ab5db8a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 10:27:16 -0400 Subject: mm/workingset: Convert workingset_refault() to take a folio This nets us 178 bytes of savings from removing calls to compound_head. The three callers all grow a little, but each of them will be converted to use folios soon, so that's fine. 
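The conversion at those call sites is mechanical; where only a page is at hand, the call becomes (sketch):

	workingset_refault(page_folio(page), shadow);

The inlined page_folio() lookup is what makes each caller grow slightly.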
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/swap.h | 4 ++-- include/linux/vmstat.h | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 067b7af5242c..892faac942da 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -329,7 +329,7 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); -void workingset_refault(struct page *page, void *shadow); +void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); /* Only track the nodes of mappings with shadow entries */ @@ -350,7 +350,7 @@ extern unsigned long nr_free_buffer_pages(void); /* linux/mm/swap.c */ extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -extern void lru_note_cost_page(struct page *); +extern void lru_note_cost_folio(struct folio *); extern void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 241bd0f53fb9..bfe38869498d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -597,12 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void inc_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx) -{ - mod_lruvec_state(lruvec, idx, 1); -} - static inline void __inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { -- cgit v1.2.3 From 0d31125d2d3267ec349796418a16761bbda20387 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 29 Apr 2021 11:09:31 -0400 Subject: mm/lru: Add folio_add_lru() Reimplement lru_cache_add() as a wrapper around folio_add_lru(). Saves 159 bytes of kernel text due to removing calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 892faac942da..cdf0957a88a4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -351,6 +351,7 @@ extern unsigned long nr_free_buffer_pages(void); extern void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); extern void lru_note_cost_folio(struct folio *); +extern void folio_add_lru(struct folio *); extern void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); -- cgit v1.2.3 From cc09cb134124a42fbe3bdcebefdc54e286d8f3e5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 15 Dec 2020 22:55:54 -0500 Subject: mm/page_alloc: Add folio allocation functions The __folio_alloc(), __folio_alloc_node() and folio_alloc() functions are mostly for type safety, but they also ensure that the page allocator allocates a compound page and initialises the deferred list if the page is large enough to have one. 
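Intended usage mirrors alloc_pages(); a sketch, with the order and gfp flags purely illustrative:

	/* allocate a naturally aligned four-page (order-2) folio */
	struct folio *folio = folio_alloc(GFP_KERNEL, 2);

	if (!folio)
		return -ENOMEM;
	/* ... use folio_address(folio) ... */
	folio_put(folio);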
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: Vlastimil Babka --- include/linux/gfp.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index dc5ff40608ce..3745efd21cf6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -523,6 +523,8 @@ static inline void arch_alloc_page(struct page *page, int order) { } struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, @@ -564,6 +566,15 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) return __alloc_pages(gfp_mask, order, nid, NULL); } +static inline +struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_WARN_ON((gfp & __GFP_THISNODE) && !node_online(nid)); + + return __folio_alloc(gfp, order, nid, NULL); +} + /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and @@ -580,6 +591,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); +struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); @@ -590,6 +602,10 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) +{ + return __folio_alloc_node(gfp, order, numa_node_id()); +} #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ -- cgit v1.2.3 From bb3c579e25e5757bc5bac1333f4a56dfebf7cb91 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 15 Dec 2020 23:11:07 -0500 Subject: mm/filemap: Add filemap_alloc_folio Reimplement __page_cache_alloc as a wrapper around filemap_alloc_folio to allow filesystems to be converted at our leisure. Increases kernel text size by 133 bytes, mostly in cachefiles_read_backing_file(). pagecache_get_page() shrinks by 32 bytes, though. 
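A converted filesystem wanting a higher-order page cache allocation could then do something like this sketch (assuming the mapping's gfp mask is suitable; the fall-back-to-order-0 pattern is illustrative, not part of this patch):

	struct folio *folio;

	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 2);
	if (!folio)
		folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);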
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fea9615bab35..51c190ae3b0b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -262,14 +262,19 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -extern struct page *__page_cache_alloc(gfp_t gfp); +struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); #else -static inline struct page *__page_cache_alloc(gfp_t gfp) +static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) { - return alloc_pages(gfp, 0); + return folio_alloc(gfp, order); } #endif +static inline struct page *__page_cache_alloc(gfp_t gfp) +{ + return &filemap_alloc_folio(gfp, 0)->page; +} + static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); -- cgit v1.2.3 From 9dd3d069406cea073fc633e77bc59abbfde8c6c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 8 Dec 2020 08:56:28 -0500 Subject: mm/filemap: Add filemap_add_folio() Convert __add_to_page_cache_locked() into __filemap_add_folio(). Add an assertion to it that (for !hugetlbfs), the folio is naturally aligned within the file. Move the prototype from mm.h to pagemap.h. Convert add_to_page_cache_lru() into filemap_add_folio(). Add a compatibility wrapper for unconverted callers. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/mm.h | 7 ------- include/linux/pagemap.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 23ab040aa65a..efcb06fc6116 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -213,13 +213,6 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -/* - * Any attempt to mark this function as static leads to build failure - * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() - * is referred to by BPF code. This must be visible for error injection. 
- */ -int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void **shadowp); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 51c190ae3b0b..a10eb6dc3866 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -876,9 +876,11 @@ static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); + pgoff_t index, gfp_t gfp); +int filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); @@ -903,6 +905,10 @@ static inline int add_to_page_cache(struct page *page, return error; } +/* Must be non-static for BPF error injection */ +int __filemap_add_folio(struct address_space *mapping, struct folio *folio, + pgoff_t index, gfp_t gfp, void **shadowp); + /** * struct readahead_control - Describes a readahead request. * -- cgit v1.2.3 From 3f0c6a07fee6a1c2618a2e12ffb8e9aa24384fde Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Mar 2021 11:45:35 -0500 Subject: mm/filemap: Add filemap_get_folio filemap_get_folio() is a replacement for find_get_page(). Turn pagecache_get_page() into a wrapper around __filemap_get_folio(). Remove find_lock_head() as this use case is now covered by filemap_get_folio(). Reduces overall kernel size by 209 bytes. __filemap_get_folio() is 316 bytes shorter than pagecache_get_page() was, but the new pagecache_get_page() wrapper is 99 bytes. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a10eb6dc3866..401a90336a2c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -302,8 +302,26 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_HEAD 0x00000080 #define FGP_ENTRY 0x00000100 -struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t cache_gfp_mask); +struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp); + +/** + * filemap_get_folio - Find and get a folio. + * @mapping: The address_space to search. + * @index: The page index. + * + * Looks up the page cache entry at @mapping & @index. If a folio is + * present, it is returned with an increased refcount. + * + * Otherwise, %NULL is returned. 
+ */ +static inline struct folio *filemap_get_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, 0, 0); +} /** * find_get_page - find and get a page reference @@ -346,25 +364,6 @@ static inline struct page *find_lock_page(struct address_space *mapping, return pagecache_get_page(mapping, index, FGP_LOCK, 0); } -/** - * find_lock_head - Locate, pin and lock a pagecache page. - * @mapping: The address_space to search. - * @index: The page index. - * - * Looks up the page cache entry at @mapping & @index. If there is a - * page cache page, its head page is returned locked and with an increased - * refcount. - * - * Context: May sleep. - * Return: A struct page which is !PageTail, or %NULL if there is no page - * in the cache for this index. - */ -static inline struct page *find_lock_head(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); -} - /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space -- cgit v1.2.3 From b27652d935f41793c5e229a1e8b3a8bb3afe3cc1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Dec 2020 12:55:56 -0500 Subject: mm/filemap: Add FGP_STABLE Allow filemap_get_folio() to wait for writeback to complete (if the filesystem wants that behaviour). This is the folio equivalent of grab_cache_page_write_begin(), which is moved into the folio-compat file as a reminder to migrate all the code using it. This paves the way for getting rid of AOP_FLAG_NOFS once grab_cache_page_write_begin() is removed. Kernel grows by 11 bytes. filemap_get_folio() grows by 33 bytes but grab_cache_page_write_begin() shrinks by 22 bytes to make up for it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells Acked-by: Vlastimil Babka --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 401a90336a2c..b68b053e581f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -301,6 +301,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 #define FGP_ENTRY 0x00000100 +#define FGP_STABLE 0x00000200 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); -- cgit v1.2.3 From 121703c1c817b3c77f61002466d0bfca7e39f25d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Mar 2021 13:48:03 -0500 Subject: mm/writeback: Add folio_write_one Transform write_one_page() into folio_write_one() and add a compatibility wrapper. Also move the declaration to pagemap.h as this is page cache functionality that doesn't need to be used by the rest of the kernel. Saves 58 bytes of kernel text. While folio_write_one() is 101 bytes smaller than write_one_page(), the inlined call to page_folio() expands each caller. There are fewer than ten callers so it doesn't seem worth putting a wrapper in the core. 
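Usage is unchanged apart from the types; a sketch (surrounding context and error handling elided):

	folio_lock(folio);
	folio_mark_dirty(folio);
	/* writes the folio back and unlocks it; returns 0 or a negative errno */
	err = folio_write_one(folio);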
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Howells --- include/linux/mm.h | 4 ---- include/linux/pagemap.h | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index efcb06fc6116..40ff114aaf9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2788,10 +2788,6 @@ extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); -/* mm/page-writeback.c */ -int __must_check write_one_page(struct page *page); -void task_dirty_inc(struct task_struct *tsk); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b68b053e581f..013cdc90f5fd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -803,6 +803,11 @@ static inline void cancel_dirty_page(struct page *page) } bool folio_clear_dirty_for_io(struct folio *folio); bool clear_page_dirty_for_io(struct page *page); +int __must_check folio_write_one(struct folio *folio); +static inline int __must_check write_one_page(struct page *page) +{ + return folio_write_one(page_folio(page)); +} int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); -- cgit v1.2.3 From f2efdb17928924c9c935c136dea764a081032006 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 16 Oct 2021 10:49:06 +0200 Subject: u64_stats: Introduce u64_stats_set() Allow a u64_stats_t value to be set directly. This is used to provide an init function which sets the value directly to zero instead of memset()ing it. Add u64_stats_set() to the u64_stats API. [bigeasy: commit message. ] Signed-off-by: Ahmed S. Darwish Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/u64_stats_sync.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index e81856c0ba13..e8ec116c916b 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) return local64_read(&p->v); } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + local64_set(&p->v, val); +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { local64_add(val, &p->v); @@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) return p->v; } +static inline void u64_stats_set(u64_stats_t *p, u64 val) +{ + p->v = val; +} + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) { p->v += val; -- cgit v1.2.3 From 29cbcd85828372333aa87542c51f2b2b0fd4380c Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 16 Oct 2021 10:49:10 +0200 Subject: net: sched: Remove Qdisc::running sequence counter The Qdisc::running sequence counter has two uses: 1. Reliably reading qdisc's tc statistics while the qdisc is running (a seqcount read/retry loop at gnet_stats_add_basic()). 2. As a flag, indicating whether the qdisc in question is running (without any retry loops). For the first usage, the Qdisc::running sequence counter write section, qdisc_run_begin() => qdisc_run_end(), covers a much wider area than what is actually needed: the raw qdisc's bstats update.
A u64_stats sync point was thus introduced (in previous commits) inside the bstats structure itself. A local u64_stats write section is then started and stopped for the bstats updates. Use that u64_stats sync point mechanism for the bstats read/retry loop at gnet_stats_add_basic(). For the second qdisc->running usage, a __QDISC_STATE_RUNNING bit flag, accessed with atomic bitops, is sufficient. Using a bit flag instead of a sequence counter at qdisc_run_begin/end() and qdisc_is_running() leads to the SMP barriers implicitly added through raw_read_seqcount() and write_seqcount_begin/end() getting removed. All call sites have been surveyed though, and no required ordering was identified. Now that the qdisc->running sequence counter is no longer used, remove it. Note, using u64_stats implies no sequence counter protection for 64-bit architectures. This can lead to the qdisc tc statistics "packets" vs. "bytes" values getting out of sync on rare occasions. The individual values will still be valid. Signed-off-by: Ahmed S. Darwish Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 173984414f38..f9cd6fea213f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1916,7 +1916,6 @@ enum netdev_ml_priv_type { * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2250,7 +2249,6 @@ struct net_device { struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; - struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; unsigned threaded:1; @@ -2360,13 +2358,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ - static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ - (dev)->qdisc_running_key = &qdisc_running_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ -- cgit v1.2.3 From 82a8daaecfd9382e9450a05f86be8a274cf69a27 Mon Sep 17 00:00:00 2001 From: Marc Bonnici Date: Fri, 15 Oct 2021 17:57:42 +0100 Subject: firmware: arm_ffa: Add support for MEM_LEND As part of the FF-A spec, an endpoint is allowed to transfer access of, or lend, a memory region to one or more borrowers. Extend the existing memory sharing implementation to support FF-A MEM_LEND functionality and expose this to other kernel drivers. Note that upon a successful MEM_LEND request the caller must ensure that the memory region specified is not accessed until a successful MEM_RECLAIM call has been made. On systems with a hypervisor present this will be enforced; however, on systems without a hypervisor the responsibility falls to the calling kernel driver to prevent access.
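From a client driver the new hook is used like memory_share(); a sketch only (the region description in args is elided, and the ops pointer is assumed to come from the driver's existing FF-A device handle):

	const struct ffa_dev_ops *ops = ffa_dev_ops_get(ffa_dev);
	struct ffa_mem_ops_args args = { /* sg list and attributes elided */ };
	int ret;

	ret = ops->memory_lend(ffa_dev, &args);
	if (ret)
		return ret;
	/* the lent region must not be touched here */
	ret = ops->memory_reclaim(args.g_handle, 0);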
Link: https://lore.kernel.org/r/20211015165742.2513065-1-marc.bonnici@arm.com Reviewed-by: Jens Wiklander Signed-off-by: Marc Bonnici Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 505c679b6a9b..85651e41ded8 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -262,6 +262,8 @@ struct ffa_dev_ops { int (*memory_reclaim)(u64 g_handle, u32 flags); int (*memory_share)(struct ffa_device *dev, struct ffa_mem_ops_args *args); + int (*memory_lend)(struct ffa_device *dev, + struct ffa_mem_ops_args *args); }; #endif /* _LINUX_ARM_FFA_H */ -- cgit v1.2.3 From 348332e000697b4ca82ef96719e02876434b8346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:12 +0200 Subject: mm: don't include <linux/blk-cgroup.h> in <linux/writeback.h> blk-cgroup.h pulls in blkdev.h and thus pretty much all the block headers. Break this dependency chain by turning wbc_blkcg_css into a macro and dropping the blk-cgroup.h include. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/writeback.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d1f65adf6a26..8eb165760752 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,7 +11,6 @@ #include #include #include -#include struct bio; @@ -109,15 +108,12 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc) return flags; } -static inline struct cgroup_subsys_state * -wbc_blkcg_css(struct writeback_control *wbc) -{ #ifdef CONFIG_CGROUP_WRITEBACK - if (wbc->wb) - return wbc->wb->blkcg_css; -#endif - return blkcg_root_css; -} +#define wbc_blkcg_css(wbc) \ + ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) +#else +#define wbc_blkcg_css(wbc) (blkcg_root_css) +#endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to -- cgit v1.2.3 From e41d12f539f7c8bac9b0897031fee6cc9158a6be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:13 +0200 Subject: mm: don't include <linux/blk-cgroup.h> in <linux/backing-dev.h> There is no need to pull blk-cgroup.h and thus blkdev.h in here, so break the include chain. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ac7f231b8825..843bf4be675f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -15,10 +15,11 @@ #include #include #include -#include #include #include +struct blkcg; + static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); -- cgit v1.2.3 From ccdf774189b6466457ca9c7de1fe9ed18547d249 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:14 +0200 Subject: mm: don't include <linux/blkdev.h> in <linux/backing-dev.h> Move inode_to_bdi out of line to avoid having to include blkdev.h.
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 843bf4be675f..4ac7ce096013 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -134,20 +133,7 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) return test_bit(WB_writeback_running, &wb->state); } -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return I_BDEV(inode)->bd_disk->bdi; -#endif - return sb->s_bdi; -} +struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { -- cgit v1.2.3 From 1d9433cdd04adf7cfd5e3ef81f5acb29d80aae9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:19 +0200 Subject: block: remove the unused rq_end_sector macro Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-9-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index ef9ceead3db1..80633f3b600c 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -162,7 +162,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 -#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) -- cgit v1.2.3 From 90138237a56282f99d71130ab7c68903a2eba5a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:20 +0200 Subject: block: remove the unused blk_queue_state enum Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12b9dbcc980e..9e367509bea1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -262,11 +262,6 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, /* reset timer and try again */ }; -enum blk_queue_state { - Queue_down, - Queue_up, -}; - #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ -- cgit v1.2.3 From 713e4e11088875bf7aee3df81b167c8b2f6c5d65 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:21 +0200 Subject: block: remove the cmd_size field from struct request_queue Entirely unused. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9e367509bea1..6737e484280d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -544,8 +544,6 @@ struct request_queue { bool mq_sysfs_init_done; - size_t cmd_size; - #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -- cgit v1.2.3 From 9778ac77c2027827ffdbb33d3e936b3a0ae9f0f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:22 +0200 Subject: block: remove the struct blk_queue_ctx forward declaration This type doesn't exist at all, so no need to forward declare it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6737e484280d..4bcbb1ae2d66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -253,8 +253,6 @@ static inline unsigned short req_get_ioprio(struct request *req) #include -struct blk_queue_ctx; - struct bio_vec; enum blk_eh_timer_return { -- cgit v1.2.3 From 2e9bc3465ac54d282b855b073409c2c3a7d1ae00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:23 +0200 Subject: block: move elevator.h to block/ Except for the features passed to blk_queue_required_elevator_features, elevator.h is only needed internally to the block layer. Move the ELEVATOR_F_* definitions to blkdev.h, and then move elevator.h to block/, dropping all the spurious includes outside of that.
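The only piece that stays visible outside the block layer is the feature advertisement; for example, a zoned block driver keeps doing something like this sketch:

	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);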
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++- include/linux/elevator.h | 180 ----------------------------------------------- 2 files changed, 9 insertions(+), 182 deletions(-) delete mode 100644 include/linux/elevator.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4bcbb1ae2d66..d815238f61ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -251,8 +251,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -#include - struct bio_vec; enum blk_eh_timer_return { @@ -1124,6 +1122,15 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); + +/* + * Elevator features for blk_queue_required_elevator_features: + */ +/* Supports zoned block devices sequential write constraint */ +#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) +/* Supports scheduling on multiple hardware queues */ +#define ELEVATOR_F_MQ_AWARE (1U << 1) + extern void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, diff --git a/include/linux/elevator.h b/include/linux/elevator.h deleted file mode 100644 index 80633f3b600c..000000000000 --- a/include/linux/elevator.h +++ /dev/null @@ -1,180 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_ELEVATOR_H -#define _LINUX_ELEVATOR_H - -#include -#include - -#ifdef CONFIG_BLOCK - -struct io_cq; -struct elevator_type; -#ifdef CONFIG_BLK_DEBUG_FS -struct blk_mq_debugfs_attr; -#endif - -/* - * Return values from elevator merger - */ -enum elv_merge { - ELEVATOR_NO_MERGE = 0, - ELEVATOR_FRONT_MERGE = 1, - ELEVATOR_BACK_MERGE = 2, - ELEVATOR_DISCARD_MERGE = 3, -}; - -struct blk_mq_alloc_data; -struct blk_mq_hw_ctx; - -struct elevator_mq_ops { - int (*init_sched)(struct request_queue *, struct elevator_type *); - void (*exit_sched)(struct elevator_queue *); - int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*depth_updated)(struct blk_mq_hw_ctx *); - - bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); - bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); - int (*request_merge)(struct request_queue *q, struct request **, struct bio *); - void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); - void (*requests_merged)(struct request_queue *, struct request *, struct request *); - void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); - void (*prepare_request)(struct request *); - void (*finish_request)(struct request *); - void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); - struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); - bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct request *, u64); - void (*requeue_request)(struct request *); - struct request *(*former_request)(struct request_queue *, struct request *); - struct request *(*next_request)(struct request_queue *, struct request *); - void (*init_icq)(struct io_cq *); - void (*exit_icq)(struct io_cq *); -}; - -#define 
ELV_NAME_MAX (16) - -struct elv_fs_entry { - struct attribute attr; - ssize_t (*show)(struct elevator_queue *, char *); - ssize_t (*store)(struct elevator_queue *, const char *, size_t); -}; - -/* - * identifies an elevator type, such as AS or deadline - */ -struct elevator_type -{ - /* managed by elevator core */ - struct kmem_cache *icq_cache; - - /* fields provided by elevator implementation */ - struct elevator_mq_ops ops; - - size_t icq_size; /* see iocontext.h */ - size_t icq_align; /* ditto */ - struct elv_fs_entry *elevator_attrs; - const char *elevator_name; - const char *elevator_alias; - const unsigned int elevator_features; - struct module *elevator_owner; -#ifdef CONFIG_BLK_DEBUG_FS - const struct blk_mq_debugfs_attr *queue_debugfs_attrs; - const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; -#endif - - /* managed by elevator core */ - char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */ - struct list_head list; -}; - -#define ELV_HASH_BITS 6 - -void elv_rqhash_del(struct request_queue *q, struct request *rq); -void elv_rqhash_add(struct request_queue *q, struct request *rq); -void elv_rqhash_reposition(struct request_queue *q, struct request *rq); -struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); - -/* - * each queue has an elevator_queue associated with it - */ -struct elevator_queue -{ - struct elevator_type *type; - void *elevator_data; - struct kobject kobj; - struct mutex sysfs_lock; - unsigned int registered:1; - DECLARE_HASHTABLE(hash, ELV_HASH_BITS); -}; - -/* - * block elevator interface - */ -extern enum elv_merge elv_merge(struct request_queue *, struct request **, - struct bio *); -extern void elv_merge_requests(struct request_queue *, struct request *, - struct request *); -extern void elv_merged_request(struct request_queue *, struct request *, - enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, - struct list_head *); -extern struct request *elv_former_request(struct request_queue *, struct request *); -extern struct request *elv_latter_request(struct request_queue *, struct request *); -void elevator_init_mq(struct request_queue *q); - -/* - * io scheduler registration - */ -extern int elv_register(struct elevator_type *); -extern void elv_unregister(struct elevator_type *); - -/* - * io scheduler sysfs switching - */ -extern ssize_t elv_iosched_show(struct request_queue *, char *); -extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); - -extern bool elv_bio_merge_ok(struct request *, struct bio *); -extern struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *); - -/* - * Helper functions. - */ -extern struct request *elv_rb_former_request(struct request_queue *, struct request *); -extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); - -/* - * rb support functions. 
- */ -extern void elv_rb_add(struct rb_root *, struct request *); -extern void elv_rb_del(struct rb_root *, struct request *); -extern struct request *elv_rb_find(struct rb_root *, sector_t); - -/* - * Insertion selection - */ -#define ELEVATOR_INSERT_FRONT 1 -#define ELEVATOR_INSERT_BACK 2 -#define ELEVATOR_INSERT_SORT 3 -#define ELEVATOR_INSERT_REQUEUE 4 -#define ELEVATOR_INSERT_FLUSH 5 -#define ELEVATOR_INSERT_SORT_MERGE 6 - -#define rb_entry_rq(node) rb_entry((node), struct request, rb_node) - -#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) -#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) - -/* - * Elevator features. - */ - -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) -/* Supports scheduling on multiple hardware queues */ -#define ELEVATOR_F_MQ_AWARE (1U << 1) - -#endif /* CONFIG_BLOCK */ -#endif -- cgit v1.2.3 From 3ab0bc78e96bd03a5096e4801550926d81b3e19d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:24 +0200 Subject: block: drop unused includes in <linux/blkdev.h> Drop various includes not actually used in blkdev.h itself. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d815238f61ed..46a703394f7f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -3,8 +3,6 @@ #define _LINUX_BLKDEV_H #include -#include -#include #include #include #include @@ -12,17 +10,12 @@ #include #include #include -#include -#include #include -#include #include -#include #include #include #include #include -#include #include struct module; -- cgit v1.2.3 From b81e0c2372e65e5627864ba034433b64b2fc73f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:25 +0200 Subject: block: drop unused includes in <linux/genhd.h> Drop various includes not actually used in genhd.h itself, and move the remaining includes closer together. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 14 ++------------ include/linux/part_stat.h | 1 + 2 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0f5315c2b5a3..0b48a0cf4262 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,12 +12,10 @@ #include #include -#include -#include -#include #include #include -#include +#include +#include extern const struct device_type disk_type; extern struct device_type part_type; @@ -26,14 +24,6 @@ extern struct class block_class; #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 -#include -#include -#include -#include -#include -#include -#include - #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL.
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index d2558121d48c..6f7949b2fd8d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -3,6 +3,7 @@ #define _LINUX_PART_STAT_H #include +#include struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; -- cgit v1.2.3 From badf7f64378796d460c79eb0f182fa7282eb65d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:26 +0200 Subject: block: move a few merge helpers out of <linux/blkdev.h> These are block-layer internal helpers, so move them to block/blk.h and block/blk-merge.c. Also update a comment a bit to use better grammar. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 64 -------------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 46a703394f7f..be534040ca9c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -745,37 +745,6 @@ static inline bool rq_is_sync(struct request *rq) return op_is_sync(rq->cmd_flags); } -static inline bool rq_mergeable(struct request *rq) -{ - if (blk_rq_is_passthrough(rq)) - return false; - - if (req_op(rq) == REQ_OP_FLUSH) - return false; - - if (req_op(rq) == REQ_OP_WRITE_ZEROES) - return false; - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - return false; - - if (rq->cmd_flags & REQ_NOMERGE_FLAGS) - return false; - if (rq->rq_flags & RQF_NOMERGE_FLAGS) - return false; - - return true; -} - -static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) -{ - if (bio_page(a) == bio_page(b) && - bio_offset(a) == bio_offset(b)) - return true; - - return false; -} - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -1030,23 +999,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, return min(q->limits.max_sectors, chunk_sectors); } -static inline unsigned int blk_rq_get_max_sectors(struct request *rq, - sector_t offset) -{ - struct request_queue *q = rq->q; - - if (blk_rq_is_passthrough(rq)) - return q->limits.max_hw_sectors; - - if (!q->limits.chunk_sectors || - req_op(rq) == REQ_OP_DISCARD || - req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); - - return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); -} - static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; @@ -1490,22 +1442,6 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous.
- */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From fe45e630a1035aea94c29016f2598bbde149bbe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:27 +0200 Subject: block: move integrity handling out of <linux/blkdev.h> Split the integrity/metadata handling definitions out into a new header. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 183 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 183 ------------------------------------------ 2 files changed, 183 insertions(+), 183 deletions(-) create mode 100644 include/linux/blk-integrity.h (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h new file mode 100644 index 000000000000..8a038ea0717e --- /dev/null +++ b/include/linux/blk-integrity.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BLK_INTEGRITY_H +#define _LINUX_BLK_INTEGRITY_H + +#include + +struct request; + +enum blk_integrity_flags { + BLK_INTEGRITY_VERIFY = 1 << 0, + BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, + BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, +}; + +struct blk_integrity_iter { + void *prot_buf; + void *data_buf; + sector_t seed; + unsigned int data_size; + unsigned short interval; + const char *disk_name; +}; + +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); + +struct blk_integrity_profile { + integrity_processing_fn *generate_fn; + integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; + const char *name; +}; + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_integrity_register(struct gendisk *, struct blk_integrity *); +void blk_integrity_unregister(struct gendisk *); +int blk_integrity_compare(struct gendisk *, struct gendisk *); +int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, + struct scatterlist *); +int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); + +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + if (!bi->profile) + return NULL; + + return bi; +} + +static inline struct blk_integrity * +bdev_get_integrity(struct block_device *bdev) +{ + return blk_get_integrity(bdev->bd_disk); +} + +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return q->integrity.profile; +} + +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ + q->limits.max_integrity_segments = segs; +} + +static inline unsigned short +queue_max_integrity_segments(const struct request_queue *q) +{ + return q->limits.max_integrity_segments; +} + +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data
integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. + */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + +static inline bool blk_integrity_rq(struct request *rq) +{ + return rq->cmd_flags & REQ_INTEGRITY; +} + +/* + * Return the first bvec that contains integrity data. Only drivers that are + * limited to a single integrity segment should use this helper. + */ +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) + return NULL; + return rq->bio->bi_integrity->bip_vec; +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline int blk_rq_count_integrity_sg(struct request_queue *q, + struct bio *b) +{ + return 0; +} +static inline int blk_rq_map_integrity_sg(struct request_queue *q, + struct bio *b, + struct scatterlist *s) +{ + return 0; +} +static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) +{ + return NULL; +} +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) +{ + return NULL; +} +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return false; +} +static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) +{ + return 0; +} +static inline void blk_integrity_register(struct gendisk *d, + struct blk_integrity *b) +{ +} +static inline void blk_integrity_unregister(struct gendisk *d) +{ +} +static inline void blk_queue_max_integrity_segments(struct request_queue *q, + unsigned int segs) +{ +} +static inline unsigned short +queue_max_integrity_segments(const struct request_queue *q) +{ + return 0; +} + +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} +static inline int blk_integrity_rq(struct request *rq) +{ + return 0; +} + +static inline struct bio_vec *rq_integrity_vec(struct request *rq) +{ + return NULL; +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index be534040ca9c..56e60e5c09d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1555,189 +1555,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, - BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, -}; - -struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; -}; - -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; 
- integrity_complete_fn *complete_fn; - const char *name; -}; - -extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); -extern void blk_integrity_unregister(struct gendisk *); -extern int blk_integrity_compare(struct gendisk *, struct gendisk *); -extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, - struct scatterlist *); -extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); - -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) - return NULL; - - return bi; -} - -static inline -struct blk_integrity *bdev_get_integrity(struct block_device *bdev) -{ - return blk_get_integrity(bdev->bd_disk); -} - -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - -static inline bool blk_integrity_rq(struct request *rq) -{ - return rq->cmd_flags & REQ_INTEGRITY; -} - -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - -static inline unsigned short -queue_max_integrity_segments(const struct request_queue *q) -{ - return q->limits.max_integrity_segments; -} - -/** - * bio_integrity_intervals - Return number of integrity intervals for a bio - * @bi: blk_integrity profile for device - * @sectors: Size of the bio in 512-byte sectors - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the data integrity - * interval size of the storage device. Convert the block layer sectors - * to the appropriate number of integrity intervals. - */ -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return sectors >> (bi->interval_exp - 9); -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; -} - -/* - * Return the first bvec that contains integrity data. Only drivers that are - * limited to a single integrity segment should use this helper. 
- */ -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) - return NULL; - return rq->bio->bi_integrity->bip_vec; -} - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -struct bio; -struct block_device; -struct gendisk; -struct blk_integrity; - -static inline int blk_integrity_rq(struct request *rq) -{ - return 0; -} -static inline int blk_rq_count_integrity_sg(struct request_queue *q, - struct bio *b) -{ - return 0; -} -static inline int blk_rq_map_integrity_sg(struct request_queue *q, - struct bio *b, - struct scatterlist *s) -{ - return 0; -} -static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) -{ - return NULL; -} -static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) -{ - return NULL; -} -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return false; -} -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} -static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) -{ - return 0; -} - -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return 0; -} - -static inline struct bio_vec *rq_integrity_vec(struct request *rq) -{ - return NULL; -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); -- cgit v1.2.3 From 24b83deb29b7f06a5573b65f2ce96f5482755d43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:28 +0200 Subject: block: move struct request to blk-mq.h struct request is only used by blk-mq drivers, so move it and all related declarations to blk-mq.h. 
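In practice this means a blk-mq driver now pulls struct request and its accessors from <linux/blk-mq.h> rather than <linux/blkdev.h>. A hedged sketch of a driver-side request handler built only on relocated declarations (the function name and debug message are invented for illustration; req_op(), blk_rq_pos(), blk_rq_sectors() and blk_rq_is_passthrough() are among the definitions moved by the diff below):

#include <linux/blk-mq.h>
#include <linux/printk.h>

/*
 * Illustrative sketch only: everything referenced here comes from
 * <linux/blk-mq.h> after this change.
 */
static blk_status_t example_handle_rq(struct request *rq)
{
	/* decline passthrough commands in this simplified example */
	if (blk_rq_is_passthrough(rq))
		return BLK_STS_NOTSUPP;

	pr_debug("op %u: %u sectors at sector %llu\n", req_op(rq),
		 blk_rq_sectors(rq), (unsigned long long)blk_rq_pos(rq));
	return BLK_STS_OK;
}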
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-18-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 465 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blk_types.h | 2 - include/linux/blkdev.h | 469 +------------------------------------------ include/linux/blktrace_api.h | 2 +- include/linux/t10-pi.h | 2 +- 5 files changed, 468 insertions(+), 472 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 13ba1861e688..bd4086a6f28e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -6,10 +6,218 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; +#define BLKDEV_MIN_RQ 4 +#define BLKDEV_MAX_RQ 128 /* Default maximum */ + +typedef void (rq_end_io_fn)(struct request *, blk_status_t); + +/* + * request flags */ +typedef __u32 __bitwise req_flags_t; + +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* may not be passed by ioscheduler */ +#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) +/* request for flush sequence */ +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) +/* merge of different types, fail separately */ +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) +/* track inflight for MQ */ +#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) +/* don't call prep for this one */ +#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* vaguely specified driver internal error. Ignored by the block layer */ +#define RQF_FAILED ((__force req_flags_t)(1 << 10)) +/* don't warn about errors */ +#define RQF_QUIET ((__force req_flags_t)(1 << 11)) +/* elevator private data attached */ +#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) +/* account into disk and partition IO statistics */ +#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +/* runtime pm request */ +#define RQF_PM ((__force req_flags_t)(1 << 15)) +/* on IO scheduler merge hash */ +#define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* track IO completion time */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. */ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* The per-zone write lock is held for this request */ +#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* already slept for hybrid poll */ +#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) +/* ->timeout has been called, don't expire again */ +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) + +/* flags that prevent us from merging requests: */ +#define RQF_NOMERGE_FLAGS \ + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) + +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, +}; + +/* + * Try to put the fields that are referenced together in the same cacheline. + * + * If you modify this structure, make sure to update blk_rq_init() and + * especially blk_mq_rq_ctx_init() to take care of the added fields. 
+ */ +struct request { + struct request_queue *q; + struct blk_mq_ctx *mq_ctx; + struct blk_mq_hw_ctx *mq_hctx; + + unsigned int cmd_flags; /* op and common flags */ + req_flags_t rq_flags; + + int tag; + int internal_tag; + + /* the following two fields are internal, NEVER access directly */ + unsigned int __data_len; /* total data len */ + sector_t __sector; /* sector cursor */ + + struct bio *bio; + struct bio *biotail; + + struct list_head queuelist; + + /* + * The hash is used inside the scheduler, and killed once the + * request reaches the dispatch list. The ipi_list is only used + * to queue the request for softirq completion, which is long + * after the request has been unhashed (and even removed from + * the dispatch list). + */ + union { + struct hlist_node hash; /* merge hash */ + struct llist_node ipi_list; + }; + + /* + * The rb_node is only used inside the io scheduler, requests + * are pruned when moved to the dispatch queue. So let the + * completion_data share space with the rb_node. + */ + union { + struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; + void *completion_data; + int error_count; /* for legacy drivers, don't use */ + }; + + /* + * Three pointers are available for the IO schedulers, if they need + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the elevator data. + */ + union { + struct { + struct io_cq *icq; + void *priv[2]; + } elv; + + struct { + unsigned int seq; + struct list_head list; + rq_end_io_fn *saved_end_io; + } flush; + }; + + struct gendisk *rq_disk; + struct block_device *part; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + /* Time that the first bio started allocating this request. */ + u64 alloc_time_ns; +#endif + /* Time that this request was allocated for this IO. */ + u64 start_time_ns; + /* Time that I/O was submitted to the device. */ + u64 io_start_time_ns; + +#ifdef CONFIG_BLK_WBT + unsigned short wbt_flags; +#endif + /* + * rq sectors used for blk stats. It has the same value + * with blk_rq_sectors(rq), except that it never be zeroed + * by completion. + */ + unsigned short stats_sectors; + + /* + * Number of scatter-gather DMA addr+len pairs after + * physical address coalescing is performed. + */ + unsigned short nr_phys_segments; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + unsigned short nr_integrity_segments; +#endif + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *crypt_ctx; + struct blk_ksm_keyslot *crypt_keyslot; +#endif + + unsigned short write_hint; + unsigned short ioprio; + + enum mq_rq_state state; + refcount_t ref; + + unsigned int timeout; + unsigned long deadline; + + union { + struct __call_single_data csd; + u64 fifo_time; + }; + + /* + * completion callback. + */ + rq_end_io_fn *end_io; + void *end_io_data; +}; + +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) + +static inline bool blk_rq_is_passthrough(struct request *rq) +{ + return blk_op_is_passthrough(req_op(rq)); +} + +static inline unsigned short req_get_ioprio(struct request *req) +{ + return req->ioprio; +} + +#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) + +#define rq_dma_dir(rq) \ + (op_is_write(req_op(rq)) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE) + +enum blk_eh_timer_return { + BLK_EH_DONE, /* drivers has completed the command */ + BLK_EH_RESET_TIMER, /* reset timer and try again */ +}; + +#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ +#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ + /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device @@ -637,4 +845,261 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio); void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); +static inline bool rq_is_sync(struct request *rq) +{ + return op_is_sync(rq->cmd_flags); +} + +void blk_rq_init(struct request_queue *q, struct request *rq); +void blk_put_request(struct request *rq); +struct request *blk_get_request(struct request_queue *q, unsigned int op, + blk_mq_req_flags_t flags); +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); +void blk_rq_unprep_clone(struct request *rq); +blk_status_t blk_insert_cloned_request(struct request_queue *q, + struct request *rq); + +struct rq_map_data { + struct page **pages; + int page_order; + int nr_entries; + unsigned long offset; + int null_mapped; + int from_user; +}; + +int blk_rq_map_user(struct request_queue *, struct request *, + struct rq_map_data *, void __user *, unsigned long, gfp_t); +int blk_rq_map_user_iov(struct request_queue *, struct request *, + struct rq_map_data *, const struct iov_iter *, gfp_t); +int blk_rq_unmap_user(struct bio *); +int blk_rq_map_kern(struct request_queue *, struct request *, void *, + unsigned int, gfp_t); +int blk_rq_append_bio(struct request *rq, struct bio *bio); +void blk_execute_rq_nowait(struct gendisk *, struct request *, int, + rq_end_io_fn *); +blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, + int at_head); + +struct req_iterator { + struct bvec_iter iter; + struct bio *bio; +}; + +#define __rq_for_each_bio(_bio, rq) \ + if ((rq->bio)) \ + for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) + +#define rq_for_each_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_segment(bvl, _iter.bio, _iter.iter) + +#define rq_for_each_bvec(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_bvec(bvl, _iter.bio, _iter.iter) + +#define rq_iter_last(bvec, _iter) \ + (_iter.bio->bi_next == NULL && \ + bio_iter_last(bvec, _iter.iter)) + +/* + * blk_rq_pos() : the current sector + * blk_rq_bytes() : bytes left in the entire request + * blk_rq_cur_bytes() : bytes left in the current segment + * blk_rq_err_bytes() : bytes left till the next error boundary + * blk_rq_sectors() : sectors left in the entire request + * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_stats_sectors() : sectors of the entire request used for stats + */ +static inline sector_t blk_rq_pos(const struct request *rq) +{ + return rq->__sector; +} + +static inline unsigned int blk_rq_bytes(const struct request *rq) +{ + return rq->__data_len; +} + +static inline int blk_rq_cur_bytes(const struct request *rq) +{ + return rq->bio ? 
bio_cur_bytes(rq->bio) : 0; +} + +unsigned int blk_rq_err_bytes(const struct request *rq); + +static inline unsigned int blk_rq_sectors(const struct request *rq) +{ + return blk_rq_bytes(rq) >> SECTOR_SHIFT; +} + +static inline unsigned int blk_rq_cur_sectors(const struct request *rq) +{ + return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; +} + +static inline unsigned int blk_rq_stats_sectors(const struct request *rq) +{ + return rq->stats_sectors; +} + +/* + * Some commands like WRITE SAME have a payload or data transfer size which + * is different from the size of the request. Any driver that supports such + * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to + * calculate the data transfer size. + */ +static inline unsigned int blk_rq_payload_bytes(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + return blk_rq_bytes(rq); +} + +/* + * Return the first full biovec in the request. The caller needs to check that + * there are any bvecs before calling this helper. + */ +static inline struct bio_vec req_bvec(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec; + return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); +} + +static inline unsigned int blk_rq_count_bios(struct request *rq) +{ + unsigned int nr_bios = 0; + struct bio *bio; + + __rq_for_each_bio(bio, rq) + nr_bios++; + + return nr_bios; +} + +void blk_steal_bios(struct bio_list *list, struct request *rq); + +/* + * Request completion related functions. + * + * blk_update_request() completes given number of bytes and updates + * the request without completing it. + */ +bool blk_update_request(struct request *rq, blk_status_t error, + unsigned int nr_bytes); +void blk_abort_request(struct request *); + +/* + * Number of physical segments as sent to the device. + * + * Normally this is the number of discontiguous data segments sent by the + * submitter. But for data-less command like discard we might have no + * actual data segments submitted, but the driver might have to add it's + * own special payload. In that case we still return 1 here so that this + * special payload will be mapped. + */ +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + +/* + * Number of discard segments (or ranges) the driver needs to fill in. + * Each discard bio merged into a request is counted as one segment. 
+ */ +static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) +{ + return max_t(unsigned short, rq->nr_phys_segments, 1); +} + +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *last_sg = NULL; + + return __blk_rq_map_sg(q, rq, sglist, &last_sg); +} +void blk_dump_rq_flags(struct request *, char *); + +#ifdef CONFIG_BLK_DEV_ZONED +static inline unsigned int blk_rq_zone_no(struct request *rq) +{ + return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); +} + +static inline unsigned int blk_rq_zone_is_seq(struct request *rq) +{ + return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); +} + +bool blk_req_needs_zone_write_lock(struct request *rq); +bool blk_req_zone_write_trylock(struct request *rq); +void __blk_req_zone_write_lock(struct request *rq); +void __blk_req_zone_write_unlock(struct request *rq); + +static inline void blk_req_zone_write_lock(struct request *rq) +{ + if (blk_req_needs_zone_write_lock(rq)) + __blk_req_zone_write_lock(rq); +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) + __blk_req_zone_write_unlock(rq); +} + +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return rq->q->seq_zones_wlock && + test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + if (!blk_req_needs_zone_write_lock(rq)) + return true; + return !blk_req_zone_is_write_locked(rq); +} +#else /* CONFIG_BLK_DEV_ZONED */ +static inline bool blk_req_needs_zone_write_lock(struct request *rq) +{ + return false; +} + +static inline void blk_req_zone_write_lock(struct request *rq) +{ +} + +static inline void blk_req_zone_write_unlock(struct request *rq) +{ +} +static inline bool blk_req_zone_is_write_locked(struct request *rq) +{ + return false; +} + +static inline bool blk_req_can_dispatch_to_zone(struct request *rq) +{ + return true; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void rq_flush_dcache_pages(struct request *rq); +#else +static inline void rq_flush_dcache_pages(struct request *rq) +{ +} +#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ +#endif /* BLK_MQ_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index be622b5a21ed..3b967053e9f5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -431,8 +431,6 @@ enum stat_group { #define bio_op(bio) \ ((bio)->bi_opf & REQ_OP_MASK) -#define req_op(req) \ - ((req)->cmd_flags & REQ_OP_MASK) /* obsolete, don't use in new code */ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 56e60e5c09d0..0e960d74615e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -32,9 +31,6 @@ struct blk_queue_stats; struct blk_stat_callback; struct blk_keyslot_manager; -#define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 /* Default maximum */ - /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -47,213 +43,12 @@ struct blk_keyslot_manager; */ #define BLKCG_MAX_POLS 6 -typedef void 
(rq_end_io_fn)(struct request *, blk_status_t); - -/* - * request flags */ -typedef __u32 __bitwise req_flags_t; - -/* drive already may have started this one */ -#define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* may not be passed by ioscheduler */ -#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) -/* request for flush sequence */ -#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) -/* merge of different types, fail separately */ -#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) -/* track inflight for MQ */ -#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) -/* don't call prep for this one */ -#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* vaguely specified driver internal error. Ignored by the block layer */ -#define RQF_FAILED ((__force req_flags_t)(1 << 10)) -/* don't warn about errors */ -#define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* elevator private data attached */ -#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) -/* account into disk and partition IO statistics */ -#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* runtime pm request */ -#define RQF_PM ((__force req_flags_t)(1 << 15)) -/* on IO scheduler merge hash */ -#define RQF_HASHED ((__force req_flags_t)(1 << 16)) -/* track IO completion time */ -#define RQF_STATS ((__force req_flags_t)(1 << 17)) -/* Look at ->special_vec for the actual data payload instead of the - bio chain. */ -#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) -/* already slept for hybrid poll */ -#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) -/* ->timeout has been called, don't expire again */ -#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) - -/* flags that prevent us from merging requests: */ -#define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) - -/* - * Request state for blk-mq. - */ -enum mq_rq_state { - MQ_RQ_IDLE = 0, - MQ_RQ_IN_FLIGHT = 1, - MQ_RQ_COMPLETE = 2, -}; - -/* - * Try to put the fields that are referenced together in the same cacheline. - * - * If you modify this structure, make sure to update blk_rq_init() and - * especially blk_mq_rq_ctx_init() to take care of the added fields. - */ -struct request { - struct request_queue *q; - struct blk_mq_ctx *mq_ctx; - struct blk_mq_hw_ctx *mq_hctx; - - unsigned int cmd_flags; /* op and common flags */ - req_flags_t rq_flags; - - int tag; - int internal_tag; - - /* the following two fields are internal, NEVER access directly */ - unsigned int __data_len; /* total data len */ - sector_t __sector; /* sector cursor */ - - struct bio *bio; - struct bio *biotail; - - struct list_head queuelist; - - /* - * The hash is used inside the scheduler, and killed once the - * request reaches the dispatch list. The ipi_list is only used - * to queue the request for softirq completion, which is long - * after the request has been unhashed (and even removed from - * the dispatch list). - */ - union { - struct hlist_node hash; /* merge hash */ - struct llist_node ipi_list; - }; - - /* - * The rb_node is only used inside the io scheduler, requests - * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. 
- */ - union { - struct rb_node rb_node; /* sort/lookup */ - struct bio_vec special_vec; - void *completion_data; - int error_count; /* for legacy drivers, don't use */ - }; - - /* - * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. Flush requests are - * never put on the IO scheduler. So let the flush fields share - * space with the elevator data. - */ - union { - struct { - struct io_cq *icq; - void *priv[2]; - } elv; - - struct { - unsigned int seq; - struct list_head list; - rq_end_io_fn *saved_end_io; - } flush; - }; - - struct gendisk *rq_disk; - struct block_device *part; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - /* Time that the first bio started allocating this request. */ - u64 alloc_time_ns; -#endif - /* Time that this request was allocated for this IO. */ - u64 start_time_ns; - /* Time that I/O was submitted to the device. */ - u64 io_start_time_ns; - -#ifdef CONFIG_BLK_WBT - unsigned short wbt_flags; -#endif - /* - * rq sectors used for blk stats. It has the same value - * with blk_rq_sectors(rq), except that it never be zeroed - * by completion. - */ - unsigned short stats_sectors; - - /* - * Number of scatter-gather DMA addr+len pairs after - * physical address coalescing is performed. - */ - unsigned short nr_phys_segments; - -#if defined(CONFIG_BLK_DEV_INTEGRITY) - unsigned short nr_integrity_segments; -#endif - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - struct bio_crypt_ctx *crypt_ctx; - struct blk_ksm_keyslot *crypt_keyslot; -#endif - - unsigned short write_hint; - unsigned short ioprio; - - enum mq_rq_state state; - refcount_t ref; - - unsigned int timeout; - unsigned long deadline; - - union { - struct __call_single_data csd; - u64 fifo_time; - }; - - /* - * completion callback. - */ - rq_end_io_fn *end_io; - void *end_io_data; -}; - static inline bool blk_op_is_passthrough(unsigned int op) { op &= REQ_OP_MASK; return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } -static inline bool blk_rq_is_passthrough(struct request *rq) -{ - return blk_op_is_passthrough(req_op(rq)); -} - -static inline unsigned short req_get_ioprio(struct request *req) -{ - return req->ioprio; -} - -struct bio_vec; - -enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ -}; - -#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ - /* * Zoned block device models (zoned limit). * @@ -620,11 +415,6 @@ extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) -#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) - -#define rq_dma_dir(rq) \ - (op_is_write(req_op(rq)) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE) - #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) @@ -740,11 +530,6 @@ static inline unsigned int queue_max_active_zones(const struct request_queue *q) } #endif /* CONFIG_BLK_DEV_ZONED */ -static inline bool rq_is_sync(struct request *rq) -{ - return op_is_sync(rq->cmd_flags); -} - static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) @@ -759,83 +544,20 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) -struct rq_map_data { - struct page **pages; - int page_order; - int nr_entries; - unsigned long offset; - int null_mapped; - int from_user; -}; - -struct req_iterator { - struct bvec_iter iter; - struct bio *bio; -}; - /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) -#define __rq_for_each_bio(_bio, rq) \ - if ((rq->bio)) \ - for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) - -#define rq_for_each_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.iter) -#define rq_for_each_bvec(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_bvec(bvl, _iter.bio, _iter.iter) - -#define rq_iter_last(bvec, _iter) \ - (_iter.bio->bi_next == NULL && \ - bio_iter_last(bvec, _iter.iter)) - -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" -#endif -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -extern void rq_flush_dcache_pages(struct request *rq); -#else -static inline void rq_flush_dcache_pages(struct request *rq) -{ -} -#endif extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); blk_qc_t submit_bio_noacct(struct bio *bio); -extern void blk_rq_init(struct request_queue *q, struct request *rq); -extern void blk_put_request(struct request *); -extern struct request *blk_get_request(struct request_queue *, unsigned int op, - blk_mq_req_flags_t flags); + extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); -extern blk_status_t blk_insert_cloned_request(struct request_queue *q, - struct request *rq); -int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_queue_split(struct bio **); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -extern int blk_rq_map_user(struct request_queue *, struct request *, - struct rq_map_data *, void __user *, unsigned long, - gfp_t); -extern int blk_rq_unmap_user(struct bio *); -extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); -extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, const struct iov_iter *, - gfp_t); -extern void blk_execute_rq_nowait(struct gendisk *, - struct request *, int, rq_end_io_fn *); - -blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq, - int at_head); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char 
*blk_op_str(unsigned int op); @@ -867,47 +589,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) #define SECTOR_MASK (PAGE_SECTORS - 1) -/* - * blk_rq_pos() : the current sector - * blk_rq_bytes() : bytes left in the entire request - * blk_rq_cur_bytes() : bytes left in the current segment - * blk_rq_err_bytes() : bytes left till the next error boundary - * blk_rq_sectors() : sectors left in the entire request - * blk_rq_cur_sectors() : sectors left in the current segment - * blk_rq_stats_sectors() : sectors of the entire request used for stats - */ -static inline sector_t blk_rq_pos(const struct request *rq) -{ - return rq->__sector; -} - -static inline unsigned int blk_rq_bytes(const struct request *rq) -{ - return rq->__data_len; -} - -static inline int blk_rq_cur_bytes(const struct request *rq) -{ - return rq->bio ? bio_cur_bytes(rq->bio) : 0; -} - -extern unsigned int blk_rq_err_bytes(const struct request *rq); - -static inline unsigned int blk_rq_sectors(const struct request *rq) -{ - return blk_rq_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_cur_sectors(const struct request *rq) -{ - return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; -} - -static inline unsigned int blk_rq_stats_sectors(const struct request *rq) -{ - return rq->stats_sectors; -} - #ifdef CONFIG_BLK_DEV_ZONED /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ @@ -924,42 +605,8 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio) return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev), bio->bi_iter.bi_sector); } - -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); -} #endif /* CONFIG_BLK_DEV_ZONED */ -/* - * Some commands like WRITE SAME have a payload or data transfer size which - * is different from the size of the request. Any driver that supports such - * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to - * calculate the data transfer size. - */ -static inline unsigned int blk_rq_payload_bytes(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec.bv_len; - return blk_rq_bytes(rq); -} - -/* - * Return the first full biovec in the request. The caller needs to check that - * there are any bvecs before calling this helper. - */ -static inline struct bio_vec req_bvec(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return rq->special_vec; - return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); -} - static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { @@ -999,30 +646,6 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, return min(q->limits.max_sectors, chunk_sectors); } -static inline unsigned int blk_rq_count_bios(struct request *rq) -{ - unsigned int nr_bios = 0; - struct bio *bio; - - __rq_for_each_bio(bio, rq) - nr_bios++; - - return nr_bios; -} - -void blk_steal_bios(struct bio_list *list, struct request *rq); - -/* - * Request completion related functions. - * - * blk_update_request() completes given number of bytes and updates - * the request without completing it. 
- */ -extern bool blk_update_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes); - -extern void blk_abort_request(struct request *); - /* * Access functions for manipulating queue properties */ @@ -1081,42 +704,6 @@ extern void blk_queue_required_elevator_features(struct request_queue *q, extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); -/* - * Number of physical segments as sent to the device. - * - * Normally this is the number of discontiguous data segments sent by the - * submitter. But for data-less command like discard we might have no - * actual data segments submitted, but the driver might have to add it's - * own special payload. In that case we still return 1 here so that this - * special payload will be mapped. - */ -static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) -{ - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - return 1; - return rq->nr_phys_segments; -} - -/* - * Number of discard segments (or ranges) the driver needs to fill in. - * Each discard bio merged into a request is counted as one segment. - */ -static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) -{ - return max_t(unsigned short, rq->nr_phys_segments, 1); -} - -int __blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist, struct scatterlist **last_sg); -static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) -{ - struct scatterlist *last_sg = NULL; - - return __blk_rq_map_sg(q, rq, sglist, &last_sg); -} -extern void blk_dump_rq_flags(struct request *, char *); - bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); @@ -1613,60 +1200,6 @@ extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -#ifdef CONFIG_BLK_DEV_ZONED -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline void blk_wake_io_task(struct task_struct *waiter) { /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index a083e15df608..22501a293fa5 
100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -2,7 +2,7 @@ #ifndef BLKTRACE_H #define BLKTRACE_H -#include +#include #include #include #include diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 96305a64a5a7..c635c2e014e3 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -3,7 +3,7 @@ #define _LINUX_T10_PI_H #include -#include +#include /* * A T10 PI-capable target device can be formatted with different -- cgit v1.2.3 From d2a27964e60ff18e637334864042af54de383902 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 5 Oct 2021 18:23:27 +0800 Subject: block: Rename BLKDEV_MAX_RQ -> BLKDEV_DEFAULT_RQ It is a bit confusing that there is BLKDEV_MAX_RQ and MAX_SCHED_RQ, as the name BLKDEV_MAX_RQ would imply the max requests always, which it is not. Rename BLKDEV_MAX_RQ to BLKDEV_DEFAULT_RQ, matching its usage - that being the default number of requests assigned when allocating a request queue. Signed-off-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/1633429419-228500-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bd4086a6f28e..31cc41dfef6b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -12,7 +12,7 @@ struct blk_mq_tags; struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 /* Default maximum */ +#define BLKDEV_DEFAULT_RQ 128 typedef void (rq_end_io_fn)(struct request *, blk_status_t); -- cgit v1.2.3 From e155b0c238b20f0a866f4334d292656665836c8a Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 5 Oct 2021 18:23:37 +0800 Subject: blk-mq: Use shared tags for shared sbitmap support Currently we use separate sbitmap pairs and active_queues atomic_t for shared sbitmap support. However a full set of static requests is used per HW queue, which is quite wasteful, considering that the total number of requests usable at any given time across all HW queues is limited by the shared sbitmap depth. As such, it is considerably more memory efficient in the case of shared sbitmap to allocate a set of static rqs per tag set or request queue, and not per HW queue. So replace the sbitmap pairs and active_queues atomic_t with shared tags per tagset and request queue, which will hold a set of shared static rqs. Since there is now no valid HW queue index to be passed to the blk_mq_ops .init_request and .exit_request callbacks, pass an invalid index token. This changes the semantics of the APIs, such that the callback would need to validate the HW queue index before using it. Currently no user of shared sbitmap actually uses the HW queue index (as would be expected). Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1633429419-228500-13-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 15 +++++++-------- include/linux/blkdev.h | 3 +-- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 31cc41dfef6b..faa20a19bfcc 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -440,13 +440,11 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set.
- * @active_queues_shared_sbitmap: - * number of active request queues per tag set. - * @__bitmap_tags: A shared tags sbitmap, used over all hctx's - * @__breserved_tags: - * A shared reserved tags sbitmap, used over all hctx's * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. + * @shared_sbitmap_tags: + * Shared sbitmap set of tags. Has @nr_hw_queues elements. If + * set, shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. @@ -463,12 +461,11 @@ struct blk_mq_tag_set { unsigned int timeout; unsigned int flags; void *driver_data; - atomic_t active_queues_shared_sbitmap; - struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; struct blk_mq_tags **tags; + struct blk_mq_tags *shared_sbitmap_tags; + struct mutex tag_list_lock; struct list_head tag_list; }; @@ -640,6 +637,8 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define BLK_MQ_NO_HCTX_IDX (-1U) + struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e960d74615e..cf92c13eb80e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -238,8 +238,7 @@ struct request_queue { atomic_t nr_active_requests_shared_sbitmap; - struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; + struct blk_mq_tags *shared_sbitmap_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From 079a2e3e862548087041a1873bbffceb41a72a33 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 5 Oct 2021 18:23:39 +0800 Subject: blk-mq: Change shared sbitmap naming to shared tags Now that shared sbitmap support really means shared tags, rename symbols to match that. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1633429419-228500-15-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++---- include/linux/blkdev.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index faa20a19bfcc..75d75657df21 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -442,9 +442,9 @@ enum hctx_type { * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. - * @shared_sbitmap_tags: - * Shared sbitmap set of tags. Has @nr_hw_queues elements. If - * set, shared by all @tags. + * @shared_tags: + * Shared set of tags. Has @nr_hw_queues elements. If set, + * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. 
@@ -464,7 +464,7 @@ struct blk_mq_tag_set { struct blk_mq_tags **tags; - struct blk_mq_tags *shared_sbitmap_tags; + struct blk_mq_tags *shared_tags; struct mutex tag_list_lock; struct list_head tag_list; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cf92c13eb80e..b19172db7eef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -236,9 +236,9 @@ struct request_queue { struct timer_list timeout; struct work_struct timeout_work; - atomic_t nr_active_requests_shared_sbitmap; + atomic_t nr_active_requests_shared_tags; - struct blk_mq_tags *shared_sbitmap_tags; + struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP -- cgit v1.2.3 From ba0ffdd8ce48ad7f7e85191cd29f9674caca3745 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Oct 2021 12:01:07 -0600 Subject: block: bump max plugged deferred size from 16 to 32 Particularly for NVMe with efficient deferred submission for many requests, there are nice benefits to be seen by bumping the default max plug count from 16 to 32. This is especially true for virtualized setups, where the submit part is more expensive. But it can be noticed even on native hardware. Reduce the multiple queue factor from 4 to 2, since we're changing the default size. While changing it, move the defines into the block layer private header. These aren't values that anyone outside of the block layer uses, or should use. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b19172db7eef..472b4ab007c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -727,8 +727,6 @@ struct blk_plug { bool multiple_queues; bool nowait; }; -#define BLK_MAX_REQUEST_COUNT 16 -#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); -- cgit v1.2.3 From 47c122e35d7e43b14129ceb9ed3a7e67599978fa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Oct 2021 06:34:11 -0600 Subject: block: pre-allocate requests if plug is started and is a batch The caller typically has a good (or even exact) idea of how many requests it needs to submit. We can make the request/tag allocation a lot more efficient if we just allocate N requests/tags upfront when we queue the first bio from the batch. Provide a new plug start helper that allows the caller to specify how many IOs are expected. This sets plug->nr_ios, and we can use that for smarter request allocation. The plug provides a holding spot for requests, and request allocation will check it before calling into the normal request allocation path. When blk_finish_plug() is called, check if there are unused requests and free them. This should not happen in normal operations. The exception is if we get merging, then we may be left with requests that need freeing when done. This raises the per-core performance on my setup from ~5.8M to ~6.1M IOPS.
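For illustration, a submitter that knows its batch size could drive the new interface roughly like this (a minimal sketch against the declarations added below; bios[] and nr_ios stand in for caller state and are hypothetical):

	struct blk_plug plug;
	unsigned short i;

	/* hint that roughly nr_ios requests will be queued from this context */
	blk_start_plug_nr_ios(&plug, nr_ios);
	for (i = 0; i < nr_ios; i++)
		submit_bio(bios[i]);	/* allocation may draw from plug->cached_rq */
	blk_finish_plug(&plug);		/* pre-allocated but unused requests are freed here */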
Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ++++- include/linux/blkdev.h | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 75d75657df21..0e941f217578 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,7 +90,10 @@ struct request { struct bio *bio; struct bio *biotail; - struct list_head queuelist; + union { + struct list_head queuelist; + struct request *rq_next; + }; /* * The hash is used inside the scheduler, and killed once the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 472b4ab007c6..17705c970d7e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -722,10 +722,17 @@ extern void blk_set_queue_dying(struct request_queue *); */ struct blk_plug { struct list_head mq_list; /* blk-mq requests */ - struct list_head cb_list; /* md requires an unplug callback */ + + /* if ios_left is > 1, we can batch tag/rq allocations */ + struct request *cached_rq; + unsigned short nr_ios; + unsigned short rq_count; + bool multiple_queues; bool nowait; + + struct list_head cb_list; /* md requires an unplug callback */ }; struct blk_plug_cb; @@ -738,6 +745,7 @@ struct blk_plug_cb { extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); +extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); extern void blk_flush_plug_list(struct blk_plug *, bool); @@ -772,6 +780,11 @@ long nr_blockdev_pages(void); struct blk_plug { }; +static inline void blk_start_plug_nr_ios(struct blk_plug *plug, + unsigned short nr_ios) +{ +} + static inline void blk_start_plug(struct blk_plug *plug) { } -- cgit v1.2.3 From 84b8514b46b417e299746b1297d3d0899e8539f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 12:44:49 +0200 Subject: block: move the *blkdev_ioctl declarations out of blkdev.h These are only used inside of block/. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012104450.659013-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0b48a0cf4262..cd4038fd5743 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -281,10 +281,6 @@ bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void set_capacity(struct gendisk *disk, sector_t size); -/* for drivers/char/raw.c: */ -int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); -long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); - #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -- cgit v1.2.3 From 9e8c0d0d4d21d2c2c60132e2c18a73d08a230b42 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:17:57 +0200 Subject: block: remove BIO_BUG_ON BIO_DEBUG is always defined, so just switch the two instances to use BUG_ON directly. 
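The conversion at the two in-tree call sites is mechanical; illustratively (the exact assertion expression is a sketch, not quoted from bio.c):

	-	BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
	+	BUG_ON(!atomic_read(&bio->__bi_cnt));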
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 00952e92eae1..65a356fa7110 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -11,14 +11,6 @@ #include <linux/blk_types.h> #include <linux/uio.h> -#define BIO_DEBUG - -#ifdef BIO_DEBUG -#define BIO_BUG_ON BUG_ON -#else -#define BIO_BUG_ON -#endif - #define BIO_MAX_VECS 256U static inline unsigned int bio_max_segs(unsigned int nr_segs) -- cgit v1.2.3 From 11d9cab1ca6ed08747c6c5a274c6a8e8307aefbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:17:58 +0200 Subject: block: don't include <linux/ioprio.h> in bio.h <linux/bio.h> doesn't need any of the definitions from ioprio.h. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 65a356fa7110..ae94794d07c9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -6,7 +6,6 @@ #include <linux/mempool.h> -#include <linux/ioprio.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> #include <linux/uio.h> -- cgit v1.2.3 From 8addffd657a956944c1b8a74a1c9aabcfc5b4530 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:17:59 +0200 Subject: block: move bio_mergeable out of bio.h bio_mergeable is only needed by I/O schedulers, so move it to blk-mq-sched.h. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ae94794d07c9..ec255f282994 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -69,14 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio) bio_op(bio) == REQ_OP_WRITE_ZEROES; } -static inline bool bio_mergeable(struct bio *bio) -{ - if (bio->bi_opf & REQ_NOMERGE_FLAGS) - return false; - - return true; -} - static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio_has_data(bio)) -- cgit v1.2.3 From b6559d8f9fdd7f0e139161cffea2645bd8d084c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:18:00 +0200 Subject: block: fold bio_cur_bytes into blk_rq_cur_bytes Fold bio_cur_bytes into the only caller.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 -------- include/linux/blk-mq.h | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ec255f282994..cffd5eba401e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -69,14 +69,6 @@ static inline bool bio_no_advance_iter(const struct bio *bio) bio_op(bio) == REQ_OP_WRITE_ZEROES; } -static inline unsigned int bio_cur_bytes(struct bio *bio) -{ - if (bio_has_data(bio)) - return bio_iovec(bio).bv_len; - else /* dataless requests such as discard */ - return bio->bi_iter.bi_size; -} - static inline void *bio_data(struct bio *bio) { if (bio_has_data(bio)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0e941f217578..2219e9277118 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -927,7 +927,11 @@ static inline unsigned int blk_rq_bytes(const struct request *rq) static inline int blk_rq_cur_bytes(const struct request *rq) { - return rq->bio ? bio_cur_bytes(rq->bio) : 0; + if (!rq->bio) + return 0; + if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ + return rq->bio->bi_iter.bi_size; + return bio_iovec(rq->bio).bv_len; } unsigned int blk_rq_err_bytes(const struct request *rq); -- cgit v1.2.3 From 9a6083becbe113ed1e28059ce659dc8ae71b33c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:18:01 +0200 Subject: block: move bio_full out of bio.h bio_full is only used in bio.c, so move it there. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index cffd5eba401e..2ffc7c768091 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -77,25 +77,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -/** - * bio_full - check if the bio is full - * @bio: bio to check - * @len: length of one segment to be added - * - * Return true if @bio is full and one segment with @len bytes can't be - * added to the bio, otherwise return false - */ -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - - return false; -} - static inline bool bio_next_segment(const struct bio *bio, struct bvec_iter_all *iter) { -- cgit v1.2.3 From 9774b39175fe3606ce2a836a47134c53b75681c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:18:02 +0200 Subject: block: mark __bio_try_merge_page static Mark __bio_try_merge_page static and move it up a bit to avoid the need for a forward declaration. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2ffc7c768091..faa0a2fabaf0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -425,8 +425,6 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -- cgit v1.2.3 From ff18d77b5f0cf3e25aa13741eed4d788a13c04d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:18:03 +0200 Subject: block: move bio_get_{first,last}_bvec out of bio.h bio_get_first_bvec and bio_get_last_bvec are only used in blk-merge.c, so move them there. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index faa0a2fabaf0..4c5106f2ed57 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -221,37 +221,6 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) bio->bi_flags &= ~(1U << bit); } -static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) -{ - *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); -} - -static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) -{ - struct bvec_iter iter = bio->bi_iter; - int idx; - - bio_get_first_bvec(bio, bv); - if (bv->bv_len == bio->bi_iter.bi_size) - return; /* this bio only has a single bvec */ - - bio_advance_iter(bio, &iter, iter.bi_size); - - if (!iter.bi_bvec_done) - idx = iter.bi_idx - 1; - else /* in the middle of bvec */ - idx = iter.bi_idx; - - *bv = bio->bi_io_vec[idx]; - - /* - * iter.bi_bvec_done records actual length of the last bvec - * if this bio ends in the middle of one io vector - */ - if (iter.bi_bvec_done) - bv->bv_len = iter.bi_bvec_done; -} - static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); -- cgit v1.2.3 From 4f7ab09a1ca0bc955635705c359ab3021b405fd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 18:18:04 +0200 Subject: block: mark bio_truncate static bio_truncate is only used in bio.c, so mark it static. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012161804.991559-9-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 4c5106f2ed57..d8d27742a75f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,7 +405,6 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); -void bio_truncate(struct bio *bio, unsigned new_size); void guard_bio_eod(struct bio *bio); void zero_fill_bio(struct bio *bio); -- cgit v1.2.3 From 9672b0d43782047b1825a96bafee1b6aefa35bc2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 9 Oct 2021 13:02:23 -0600 Subject: sbitmap: add __sbitmap_queue_get_batch() The block layer tag allocation batching still calls into sbitmap to get each tag, but we can improve on that. Add __sbitmap_queue_get_batch(), which returns a mask of tags all at once, along with an offset for those tags. An example return would be 0xff, where bits 0..7 are set, with tag_offset == 128. The valid tags in this case would be 128..135. A batch is specific to an individual sbitmap word, hence it cannot be larger than that. The requested number of tags is automatically reduced to the max that can be satisfied with a single map. On failure, 0 is returned. Caller should fall back to single tag allocation at that point. Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 2713e689ad66..e30b56023ead 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -426,6 +426,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); +/** + * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits + * @sbq: Bitmap queue to allocate from. + * @nr_tags: number of tags requested + * @offset: offset to add to returned bits + * + * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is + * a bit in the mask returned, and the caller must add @offset to the value to + * get the absolute tag value. + */ +unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, + unsigned int *offset); + /** * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption -- cgit v1.2.3 From f70299f0d58e0e21f7f5f5ab27e601e8d3f0373e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:15 +0200 Subject: blk-mq: factor out a blk_qc_to_hctx helper Add a helper to get the hctx from a request_queue and cookie, and fold the blk_qc_t_to_queue_num helper into it as no other callers are left.
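The folded helper would end up looking roughly like this in blk-mq.c (a sketch; BLK_QC_T_INTERNAL and BLK_QC_T_SHIFT are the cookie constants still present in blk_types.h at this point in the series):

	static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
							   blk_qc_t qc)
	{
		return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
	}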
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3b967053e9f5..000351c5312a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -505,11 +505,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie) return cookie != BLK_QC_T_NONE; } -static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -{ - return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; -} - static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) { return cookie & ((1u << BLK_QC_T_SHIFT) - 1); -- cgit v1.2.3 From efbabbe121f96d4b1a98a2c2ef5d2e8f7fb41c87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:17 +0200 Subject: blk-mq: remove blk_qc_t_to_tag and blk_qc_t_is_internal Merge both functions into their only caller to keep the blk-mq tag to blk_qc_t mapping as private as possible in blk-mq.c. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 000351c5312a..fb7c1477617b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -505,16 +505,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie) return cookie != BLK_QC_T_NONE; } -static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) -{ - return cookie & ((1u << BLK_QC_T_SHIFT) - 1); -} - -static inline bool blk_qc_t_is_internal(blk_qc_t cookie) -{ - return (cookie & BLK_QC_T_INTERNAL) != 0; -} - struct blk_rq_stat { u64 mean; u64 min; -- cgit v1.2.3 From 28a1ae6b9daba6ac65700eeb38479bd6fadec089 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:18 +0200 Subject: blk-mq: remove blk_qc_t_valid Move the trivial check into the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-9-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fb7c1477617b..5017ba8fc539 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -500,11 +500,6 @@ typedef unsigned int blk_qc_t; #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) -static inline bool blk_qc_t_valid(blk_qc_t cookie) -{ - return cookie != BLK_QC_T_NONE; -} - struct blk_rq_stat { u64 mean; u64 min; -- cgit v1.2.3 From ef99b2d37666b7a600baab9e1c4944436652b0a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:19 +0200 Subject: block: replace the spin argument to blk_iopoll with a flags argument Switch the boolean spin argument to blk_poll to passing a set of flags instead. This will allow to control polling behavior in a more fine grained way. 
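Callers map the old boolean onto the new flags along these lines (an illustrative sketch, not taken from a specific conversion):

	ret = blk_poll(q, cookie, true);		/* old: keep polling until a completion is found */
	ret = blk_poll(q, cookie, 0);			/* new equivalent */

	ret = blk_poll(q, cookie, false);		/* old: poll the hardware only once */
	ret = blk_poll(q, cookie, BLK_POLL_ONESHOT);	/* new equivalent */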
Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-10-hch@lst.de [axboe: adapt to changed io_uring iopoll] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- include/linux/fs.h | 2 +- include/linux/iomap.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 17705c970d7e..e177346bc020 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -564,7 +564,9 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); +/* only poll the hardware once, don't continue until a completion was found */ +#define BLK_POLL_ONESHOT (1 << 0) +int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..c443cddf414f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2075,7 +2075,7 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iopoll)(struct kiocb *kiocb, unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..1e86b65567c2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -337,7 +337,7 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); ssize_t iomap_dio_complete(struct iomap_dio *dio); -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); +int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags); #ifdef CONFIG_SWAP struct file; -- cgit v1.2.3 From d729cf9acb9311956c8a37113dcfa0160a2d9665 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:20 +0200 Subject: io_uring: don't sleep when polling for I/O There is no point in sleeping for the expected I/O completion timeout in the io_uring async polling model as we never poll for a specific I/O. 
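With the flag added below, an io_uring-style poller that must never block can then request pure polling (sketch):

	/* never hybrid-sleep, regardless of the queue's hybrid poll setting */
	ret = blk_poll(q, cookie, BLK_POLL_NOSLEEP);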
Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e177346bc020..2b80c98fc373 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,6 +566,8 @@ blk_status_t errno_to_blk_status(int errno); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) +/* do not sleep to wait for the expected completion time */ +#define BLK_POLL_NOSLEEP (1 << 1) int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) -- cgit v1.2.3 From 6ce913fe3eee14f40f778e85999c9e599dda8c6b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:21 +0200 Subject: block: rename REQ_HIPRI to REQ_POLLED Unlike the RWF_HIPRI userspace ABI which is intentionally kept vague, the bio flag is specific to the polling implementation, so rename and document it properly. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-12-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk_types.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d8d27742a75f..c7a2d880e927 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -706,7 +706,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, */ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) { - bio->bi_opf |= REQ_HIPRI; + bio->bi_opf |= REQ_POLLED; if (!is_sync_kiocb(kiocb)) bio->bi_opf |= REQ_NOWAIT; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5017ba8fc539..f8b9fce68834 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -384,7 +384,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_HIPRI, + __REQ_POLLED, /* caller polls for completion using blk_poll */ /* for driver use */ __REQ_DRV, @@ -409,7 +409,7 @@ enum req_flag_bits { #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_HIPRI (1ULL << __REQ_HIPRI) +#define REQ_POLLED (1ULL << __REQ_POLLED) #define REQ_DRV (1ULL << __REQ_DRV) #define REQ_SWAP (1ULL << __REQ_SWAP) -- cgit v1.2.3 From 19416123ab3e1348b3532347af221d8f60838431 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Oct 2021 13:12:23 +0200 Subject: block: define 'struct bvec_iter' as packed 'struct bvec_iter' is embedded into 'struct bio'; define it as packed so that we can get an extra 4 bytes for other uses without expanding bio. 'struct bvec_iter' is often allocated on stack, so making it packed doesn't affect performance. Also, I have run io_uring on both nvme/null_blk and did not observe any performance effect from the change.
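For intuition, the resulting layout (field list as in bvec.h; the size arithmetic assumes a 64-bit sector_t and is illustrative rather than part of the commit):

	struct bvec_iter {
		sector_t	bi_sector;	/* 8 bytes */
		unsigned int	bi_size;	/* 4 bytes */
		unsigned int	bi_idx;		/* 4 bytes */
		unsigned int	bi_bvec_done;	/* 4 bytes */
	} __packed;	/* 20 bytes; without __packed, alignment pads this to 24 */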
Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0e9bdd42dafb..35c25dff651a 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -44,7 +44,7 @@ struct bvec_iter { unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ -}; +} __packed; struct bvec_iter_all { struct bio_vec bv; -- cgit v1.2.3 From 3e08773c3841e9db7a520908cc2b136a77d275ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:24 +0200 Subject: block: switch polling to be bio based Replace the blk_poll interface that requires the caller to keep a queue and cookie from the submissions with polling based on the bio. Polling for the bio itself leads to a few advantages: - the cookie construction can be made entirely private in blk-mq.c - the caller does not need to remember the request_queue and cookie separately and thus sidesteps their lifetime issues - keeping the device and the cookie inside the bio makes it trivial to support polling of BIOs remapped by stacking drivers - a lot of code to propagate the cookie back up the submission path can be removed entirely. Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk-mq.h | 15 ++------------- include/linux/blk_types.h | 12 +++++------- include/linux/blkdev.h | 8 +++++--- include/linux/fs.h | 6 +----- include/linux/iomap.h | 5 ++--- 6 files changed, 16 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c7a2d880e927..62d684b7dd4c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -349,7 +349,7 @@ static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } -extern blk_qc_t submit_bio(struct bio *); +void submit_bio(struct bio *bio); extern void bio_endio(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2219e9277118..a9c1d0882550 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -359,9 +359,9 @@ struct blk_mq_hw_ctx { /** @kobj: Kernel object for sysfs. */ struct kobject kobj; - /** @poll_considered: Count times blk_poll() was called. */ + /** @poll_considered: Count times blk_mq_poll() was called. */ unsigned long poll_considered; - /** @poll_invoked: Count how many requests blk_poll() polled. */ + /** @poll_invoked: Count how many requests blk_mq_poll() polled. */ unsigned long poll_invoked; /** @poll_success: Count how many polled requests were completed.
*/ unsigned long poll_success; @@ -815,16 +815,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) -static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - if (rq->tag != -1) - return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); - - return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) | - BLK_QC_T_INTERNAL; -} - static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) @@ -843,7 +833,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->rq_disk = bio->bi_bdev->bd_disk; } -blk_qc_t blk_mq_submit_bio(struct bio *bio); void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f8b9fce68834..72736b4c057c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -208,6 +208,9 @@ static inline void bio_issue_init(struct bio_issue *issue, ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } +typedef unsigned int blk_qc_t; +#define BLK_QC_T_NONE -1U + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -227,8 +230,8 @@ struct bio { struct bvec_iter bi_iter; + blk_qc_t bi_cookie; bio_end_io_t *bi_end_io; - void *bi_private; #ifdef CONFIG_BLK_CGROUP /* @@ -384,7 +387,7 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_POLLED, /* caller polls for completion using blk_poll */ + __REQ_POLLED, /* caller polls for completion using bio_poll */ /* for driver use */ __REQ_DRV, @@ -495,11 +498,6 @@ static inline int op_stat_group(unsigned int op) return op_is_write(op); } -typedef unsigned int blk_qc_t; -#define BLK_QC_T_NONE -1U -#define BLK_QC_T_SHIFT 16 -#define BLK_QC_T_INTERNAL (1U << 31) - struct blk_rq_stat { u64 mean; u64 min; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2b80c98fc373..2a8689e949b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ struct request; struct sg_io_hdr; struct blkcg_gq; struct blk_flush_queue; +struct kiocb; struct pr_ops; struct rq_qos; struct blk_queue_stats; @@ -550,7 +551,7 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -blk_qc_t submit_bio_noacct(struct bio *bio); +void submit_bio_noacct(struct bio *bio); extern int blk_lld_busy(struct request_queue *q); extern void blk_queue_split(struct bio **); @@ -568,7 +569,8 @@ blk_status_t errno_to_blk_status(int errno); #define BLK_POLL_ONESHOT (1 << 0) /* do not sleep to wait for the expected completion time */ #define BLK_POLL_NOSLEEP (1 << 1) -int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); +int bio_poll(struct bio *bio, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1176,7 +1178,7 @@ static inline void blk_ksm_unregister(struct request_queue *q) { } struct block_device_operations { - blk_qc_t (*submit_bio) (struct bio *bio); + void (*submit_bio)(struct bio *bio); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); diff --git 
a/include/linux/fs.h b/include/linux/fs.h index c443cddf414f..f595f4097cb7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -334,11 +334,7 @@ struct kiocb { int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ - union { - unsigned int ki_cookie; /* for ->iopoll */ - struct wait_page_queue *ki_waitq; /* for async buffered IO */ - }; - + struct wait_page_queue *ki_waitq; /* for async buffered IO */ randomized_struct_fields_end }; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 1e86b65567c2..63f4ea4dac9b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -313,8 +313,8 @@ int iomap_writepages(struct address_space *mapping, struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); - blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio, - loff_t file_offset); + void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset); }; /* @@ -337,7 +337,6 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); ssize_t iomap_dio_complete(struct iomap_dio *dio); -int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags); #ifdef CONFIG_SWAP struct file; -- cgit v1.2.3 From 17220ca5ce9606c1b015c4316fca18734c2df0bb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 14 Oct 2021 15:03:26 +0100 Subject: block: cache request queue in bdev There are tons of places where we need to get a request_queue only having bdev, which turns into bdev->bd_disk->queue. There are probably a hundred of such places considering inline helpers, and enough of them are in hot paths. Cache queue pointer in struct block_device and make use of it in bdev_get_queue(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a3bfaecdd28956f03629d0ca5c63ebc096e1c809.1634219547.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 72736b4c057c..1e370929c89e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -38,6 +38,7 @@ struct block_device { u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; + struct request_queue * bd_queue; /* The counter of freeze processes */ int bd_fsfreeze_count; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a8689e949b4..d5b21fc8f49e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -574,7 +574,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { - return bdev->bd_disk->queue; /* this is never NULL */ + return bdev->bd_queue; /* this is never NULL */ } /* -- cgit v1.2.3 From b5f9c644eb1baafcd349ad134e2110773f8d0a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Oct 2021 14:59:35 +0200 Subject: PCI: Remove struct pci_dev->driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no remaining uses of the struct pci_dev->driver pointer, so remove it. 
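Earlier patches in this series converted the remaining readers to go through the driver core instead, along these lines (illustrative):

	-	struct pci_driver *drv = pdev->driver;
	+	struct pci_driver *drv = to_pci_driver(pdev->dev.driver);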
Link: https://lore.kernel.org/r/20211004125935.2300113-12-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index a2d62165a7f7..03bfdb25a55c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -342,7 +342,6 @@ struct pci_dev { u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */ unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */ - struct pci_driver *driver; /* Driver bound to this device */ u64 dma_mask; /* Mask of the bits of bus address this device implements. Normally this is 0xffffffff. You only need to change -- cgit v1.2.3 From bb523b406c849eef8f265a07cd7f320f1f177743 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 2 Aug 2021 13:44:20 +0200 Subject: gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable} Turn fault_in_pages_{readable,writeable} into versions that return the number of bytes not faulted in, similar to copy_to_user, instead of returning a non-zero value when any of the requested pages couldn't be faulted in. This supports the existing users that require all pages to be faulted in as well as new users that are happy if any pages can be faulted in. Rename the functions to fault_in_{readable,writeable} to make sure this change doesn't silently break things. Neither of these functions is entirely trivial and it doesn't seem useful to inline them, so move them to mm/gup.c. Signed-off-by: Andreas Gruenbacher --- include/linux/pagemap.h | 57 +++---------------------------------------------- 1 file changed, 3 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..9fe94f7a4f7e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -733,61 +733,10 @@ int wait_on_page_private_2_killable(struct page *page); extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* - * Fault everything in given userspace address range in. + * Fault in userspace address range. */ -static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) -{ - char __user *end = uaddr + size - 1; - - if (unlikely(size == 0)) - return 0; - - if (unlikely(uaddr > end)) - return -EFAULT; - /* - * Writing zeroes into userspace here is OK, because we know that if - * the zero gets there, we'll be overwriting it. - */ - do { - if (unlikely(__put_user(0, uaddr) != 0)) - return -EFAULT; - uaddr += PAGE_SIZE; - } while (uaddr <= end); - - /* Check whether the range spilled into the next page. */ - if (((unsigned long)uaddr & PAGE_MASK) == - ((unsigned long)end & PAGE_MASK)) - return __put_user(0, end); - - return 0; -} - -static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) -{ - volatile char c; - const char __user *end = uaddr + size - 1; - - if (unlikely(size == 0)) - return 0; - - if (unlikely(uaddr > end)) - return -EFAULT; - - do { - if (unlikely(__get_user(c, uaddr) != 0)) - return -EFAULT; - uaddr += PAGE_SIZE; - } while (uaddr <= end); - - /* Check whether the range spilled into the next page. 
*/ - if (((unsigned long)uaddr & PAGE_MASK) == - ((unsigned long)end & PAGE_MASK)) { - return __get_user(c, end); - } - - (void)c; - return 0; -} +size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -- cgit v1.2.3 From a6294593e8a1290091d0b078d5d33da5e0cd3dfe Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 2 Aug 2021 14:54:16 +0200 Subject: iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable Turn iov_iter_fault_in_readable into a function that returns the number of bytes not faulted in, similar to copy_to_user, instead of returning a non-zero value when any of the requested pages couldn't be faulted in. This supports the existing users that require all pages to be faulted in as well as new users that are happy if any pages can be faulted in. Rename iov_iter_fault_in_readable to fault_in_iov_iter_readable to make sure this change doesn't silently break things. Signed-off-by: Andreas Gruenbacher --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 207101a9c5c3..d18458af6681 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -133,7 +133,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); -int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes); +size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); -- cgit v1.2.3 From d4aa57a1cac3c99ffd641f7c8e0a7aff5656de0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Oct 2021 09:01:43 -0600 Subject: block: don't bother iter advancing a fully done bio If we're completing nbytes and nbytes is the size of the bio, don't bother with calling into the iterator increment helpers. Just clear the bio size and we're done. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 62d684b7dd4c..9538f20ffaa5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -119,6 +119,28 @@ static inline void bio_advance_iter_single(const struct bio *bio, bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); } +void __bio_advance(struct bio *, unsigned bytes); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. 
+ */ +static inline void bio_advance(struct bio *bio, unsigned int nbytes) +{ + if (nbytes == bio->bi_iter.bi_size) { + bio->bi_iter.bi_size = 0; + return; + } + __bio_advance(bio, nbytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ @@ -381,8 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; extern int submit_bio_wait(struct bio *bio); -extern void bio_advance(struct bio *, unsigned); - extern void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs); extern void bio_uninit(struct bio *); -- cgit v1.2.3 From b60876296847e6cd7f1da4b8b7f0f31399d59aa1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Oct 2021 15:03:52 -0600 Subject: block: improve layout of struct request It's been a while since this was analyzed, move some members around to better flow with the use case. Initial state up top, and queued state after that. This improves my peak case by about 1.5%, from 7750K to 7900K IOPS. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 90 ++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a9c1d0882550..8ca9728cc7f2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -83,6 +83,8 @@ struct request { int tag; int internal_tag; + unsigned int timeout; + /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ @@ -95,49 +97,6 @@ struct request { struct request *rq_next; }; - /* - * The hash is used inside the scheduler, and killed once the - * request reaches the dispatch list. The ipi_list is only used - * to queue the request for softirq completion, which is long - * after the request has been unhashed (and even removed from - * the dispatch list). - */ - union { - struct hlist_node hash; /* merge hash */ - struct llist_node ipi_list; - }; - - /* - * The rb_node is only used inside the io scheduler, requests - * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. - */ - union { - struct rb_node rb_node; /* sort/lookup */ - struct bio_vec special_vec; - void *completion_data; - int error_count; /* for legacy drivers, don't use */ - }; - - /* - * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. Flush requests are - * never put on the IO scheduler. So let the flush fields share - * space with the elevator data. - */ - union { - struct { - struct io_cq *icq; - void *priv[2]; - } elv; - - struct { - unsigned int seq; - struct list_head list; - rq_end_io_fn *saved_end_io; - } flush; - }; - struct gendisk *rq_disk; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME @@ -180,9 +139,52 @@ struct request { enum mq_rq_state state; refcount_t ref; - unsigned int timeout; unsigned long deadline; + /* + * The hash is used inside the scheduler, and killed once the + * request reaches the dispatch list. The ipi_list is only used + * to queue the request for softirq completion, which is long + * after the request has been unhashed (and even removed from + * the dispatch list). 
+ */ + union { + struct hlist_node hash; /* merge hash */ + struct llist_node ipi_list; + }; + + /* + * The rb_node is only used inside the io scheduler, requests + * are pruned when moved to the dispatch queue. So let the + * completion_data share space with the rb_node. + */ + union { + struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; + void *completion_data; + int error_count; /* for legacy drivers, don't use */ + }; + + + /* + * Three pointers are available for the IO schedulers, if they need + * more they have to dynamically allocate it. Flush requests are + * never put on the IO scheduler. So let the flush fields share + * space with the elevator data. + */ + union { + struct { + struct io_cq *icq; + void *priv[2]; + } elv; + + struct { + unsigned int seq; + struct list_head list; + rq_end_io_fn *saved_end_io; + } flush; + }; + union { struct __call_single_data csd; u64 fifo_time; -- cgit v1.2.3 From 2ff0682da6e09c1e0db63a2d2abcd4efb531c8db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Oct 2021 09:44:38 -0600 Subject: block: store elevator state in request Add an rq private RQF_ELV flag, which tells the block layer that this request was initialized on a queue that has an IO scheduler attached. This allows for faster checking in the fast path, rather than having to deference rq->q later on. Elevator switching does full quiesce of the queue before detaching an IO scheduler, so it's safe to cache this in the request itself. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8ca9728cc7f2..95c3bd3a008e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -55,6 +55,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) +/* queue has elevator attached */ +#define RQF_ELV ((__force req_flags_t)(1 << 22)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ -- cgit v1.2.3 From 0d20e9fb1262b1f9ac895b287db892bc75b05b84 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 18 Oct 2021 17:19:31 +0200 Subject: rtc: add BSM parameter BSM or Backup Switch Mode is a common feature on RTCs, allowing to select how the RTC will decide when to switch from its primary power supply to the backup power supply. It is necessary to be able to set it from userspace as there are uses cases where it has to be done dynamically. Supported values are: RTC_BSM_DISABLED: disabled RTC_BSM_DIRECT: switching will happen as soon as Vbackup > Vdd RTC_BSM_LEVEL: switching will happen around a threshold, usually with an hysteresis RTC_BSM_STANDBY: switching will not happen until Vdd > Vbackup, this is useful to ensure the RTC doesn't draw any power until the device is first powered on. 
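From userspace, backup switch mode is exercised through the RTC parameter ioctls introduced alongside this change; a sketch (assuming the RTC_PARAM_GET/RTC_PARAM_SET ioctls and the RTC_PARAM_BACKUP_SWITCH_MODE parameter from the same series):

	struct rtc_param param = { .param = RTC_PARAM_BACKUP_SWITCH_MODE };

	if (ioctl(fd, RTC_PARAM_GET, &param) == 0 &&
	    param.uvalue == RTC_BSM_DISABLED) {
		param.uvalue = RTC_BSM_LEVEL;	/* arm level-triggered switchover */
		if (ioctl(fd, RTC_PARAM_SET, &param) < 0)
			perror("RTC_PARAM_SET");
	}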
Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211018151933.76865-6-alexandre.belloni@bootlin.com --- include/linux/rtc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 354e0843ab17..47fd1c2d3a57 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -66,6 +66,8 @@ struct rtc_class_ops { int (*alarm_irq_enable)(struct device *, unsigned int enabled); int (*read_offset)(struct device *, long *offset); int (*set_offset)(struct device *, long offset); + int (*param_get)(struct device *, struct rtc_param *param); + int (*param_set)(struct device *, struct rtc_param *param); }; struct rtc_device; -- cgit v1.2.3 From 88dee3b0efe4b769fb22ce2e963aabae403e6801 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Mon, 18 Oct 2021 20:41:09 +0800 Subject: PCI: Remove unused pci_pool wrappers The pci_pool users have been converted to dma_pool. Remove the unused pci_pool wrappers. Link: https://lore.kernel.org/r/20211018124110.214-1-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..2ae9dd7f975f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1498,19 +1498,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX) -/* kmem_cache style wrapper around pci_alloc_consistent() */ - #include <linux/dmapool.h> -#define pci_pool dma_pool -#define pci_pool_create(name, pdev, size, align, allocation) \ - dma_pool_create(name, &pdev->dev, size, align, allocation) -#define pci_pool_destroy(pool) dma_pool_destroy(pool) -#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle) -#define pci_pool_zalloc(pool, flags, handle) \ - dma_pool_zalloc(pool, flags, handle) -#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr) - struct msix_entry { u32 vector; /* Kernel uses to write allocated vector */ u16 entry; /* Driver uses to specify entry, OS writes */ -- cgit v1.2.3 From 4797632f4f1d8af4e0670adcb97bf9800dc3beca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 May 2021 20:16:57 -0700 Subject: string.h: Introduce memset_after() for wiping trailing members/padding A common idiom in kernel code is to wipe the contents of a structure after a given member. This is especially useful in places where there is trailing padding. These open-coded cases are usually difficult to read and very sensitive to struct layout changes. Introduce a new helper, memset_after() that takes the target struct instance, the byte to write, and the member name after which the zeroing should start. Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S.
Miller" Cc: Jakub Kicinski Cc: Andrew Morton Cc: Francis Laniel Cc: Vincenzo Frascino Cc: Daniel Axtens Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/string.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index ac1c769a5a80..da490c2154a9 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -271,6 +271,23 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, memcpy(dest, src, dest_len); } +/** + * memset_after - Set a value after a struct member to the end of a struct + * + * @obj: Address of target struct instance + * @v: Byte value to repeatedly write + * @member: after which struct member to start writing bytes + * + * This is good for clearing padding following the given member. + */ +#define memset_after(obj, v, member) \ +({ \ + u8 *__ptr = (u8 *)(obj); \ + typeof(v) __val = (v); \ + memset(__ptr + offsetofend(typeof(*(obj)), member), __val, \ + sizeof(*(obj)) - offsetofend(typeof(*(obj)), member)); \ +}) + /** * str_has_prefix - Test if a string has a given prefix * @str: The string to test -- cgit v1.2.3 From 6dbefad40815a61aecbcf9b552e87ef57ab8cc7d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 May 2021 20:16:57 -0700 Subject: string.h: Introduce memset_startat() for wiping trailing members and padding A common idiom in kernel code is to wipe the contents of a structure starting from a given member. These open-coded cases are usually difficult to read and very sensitive to struct layout changes. Like memset_after(), introduce a new helper, memset_startat() that takes the target struct instance, the byte to write, and the member name where zeroing should start. Note that this doesn't zero padding preceding the target member. For those cases, memset_after() should be used on the preceding member. Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Andrew Morton Cc: Francis Laniel Cc: Vincenzo Frascino Cc: Daniel Axtens Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/string.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index da490c2154a9..5a36608144a9 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -288,6 +288,24 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, sizeof(*(obj)) - offsetofend(typeof(*(obj)), member)); \ }) +/** + * memset_startat - Set a value starting at a member to the end of a struct + * + * @obj: Address of target struct instance + * @v: Byte value to repeatedly write + * @member: struct member to start writing at + * + * Note that if there is padding between the prior member and the target + * member, memset_after() should be used to clear the prior padding. + */ +#define memset_startat(obj, v, member) \ +({ \ + u8 *__ptr = (u8 *)(obj); \ + typeof(v) __val = (v); \ + memset(__ptr + offsetof(typeof(*(obj)), member), __val, \ + sizeof(*(obj)) - offsetof(typeof(*(obj)), member)); \ +}) + /** * str_has_prefix - Test if a string has a given prefix * @str: The string to test -- cgit v1.2.3 From 3080ea5553cc909b000d1f1d964a9041962f2c5b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Aug 2021 11:21:23 -0700 Subject: stddef: Introduce DECLARE_FLEX_ARRAY() helper There are many places where kernel code wants to have several different typed trailing flexible arrays. 
This would normally be done with multiple flexible arrays in a union, but since GCC and Clang don't (on the surface) allow this, there have been many open-coded workarounds, usually involving neighboring 0-element arrays at the end of a structure. For example, instead of something like this: struct thing { ... union { struct type1 foo[]; struct type2 bar[]; }; }; code works around the compiler with: struct thing { ... struct type1 foo[0]; struct type2 bar[]; }; Another case is when a flexible array is wanted as the single member within a struct (which itself is usually in a union). For example, this would be worked around as: union many { ... struct { struct type3 baz[0]; }; }; These kinds of work-arounds cause problems with size checks against such zero-element arrays (for example when building with -Warray-bounds and -Wzero-length-bounds, and with the coming FORTIFY_SOURCE improvements), so they must all be converted to "real" flexible arrays, avoiding warnings like this: fs/hpfs/anode.c: In function 'hpfs_add_sector_to_btree': fs/hpfs/anode.c:209:27: warning: array subscript 0 is outside the bounds of an interior zero-length array 'struct bplus_internal_node[0]' [-Wzero-length-bounds] 209 | anode->btree.u.internal[0].down = cpu_to_le32(a); | ~~~~~~~~~~~~~~~~~~~~~~~^~~ In file included from fs/hpfs/hpfs_fn.h:26, from fs/hpfs/anode.c:10: fs/hpfs/hpfs.h:412:32: note: while referencing 'internal' 412 | struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving | ^~~~~~~~ drivers/net/can/usb/etas_es58x/es58x_fd.c: In function 'es58x_fd_tx_can_msg': drivers/net/can/usb/etas_es58x/es58x_fd.c:360:35: warning: array subscript 65535 is outside the bounds of an interior zero-length array 'u8[0]' {aka 'unsigned char[]'} [-Wzero-length-bounds] 360 | tx_can_msg = (typeof(tx_can_msg))&es58x_fd_urb_cmd->raw_msg[msg_len]; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/net/can/usb/etas_es58x/es58x_core.h:22, from drivers/net/can/usb/etas_es58x/es58x_fd.c:17: drivers/net/can/usb/etas_es58x/es58x_fd.h:231:6: note: while referencing 'raw_msg' 231 | u8 raw_msg[0]; | ^~~~~~~ However, it _is_ entirely possible to have one or more flexible arrays in a struct or union: it just has to be in another struct. And since it cannot be alone in a struct, such a struct must have at least 1 other named member -- but that member can be zero sized. Wrap all this nonsense into the new DECLARE_FLEX_ARRAY() in support of having flexible arrays in unions (or alone in a struct). As with struct_group(), since this is needed in UAPI headers as well, implement the core there, with a non-UAPI wrapper. Additionally update kernel-doc to understand its existence. https://github.com/KSPP/linux/issues/137 Cc: Arnd Bergmann Cc: "Gustavo A. R. Silva" Signed-off-by: Kees Cook --- include/linux/stddef.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 8b103a53b000..ca507bd5f808 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -84,4 +84,17 @@ enum { #define struct_group_tagged(TAG, NAME, MEMBERS...) 
\ __struct_group(TAG, NAME, /* no attrs */, MEMBERS) +/** + * DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union + * + * @TYPE: The type of each flexible array element + * @NAME: The name of the flexible array member + * + * In order to have a flexible array member in a union or alone in a + * struct, it needs to be wrapped in an anonymous struct with at least 1 + * named member, but that member can be empty. + */ +#define DECLARE_FLEX_ARRAY(TYPE, NAME) \ + __DECLARE_FLEX_ARRAY(TYPE, NAME) + #endif -- cgit v1.2.3 From fa7845cfd53f3b1d3f60efa55db89805595bc045 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 9 Aug 2021 11:29:33 -0700 Subject: treewide: Replace open-coded flex arrays in unions In support of enabling -Warray-bounds and -Wzero-length-bounds and correctly handling run-time memcpy() bounds checking, replace all open-coded flexible arrays (i.e. 0-element arrays) in unions with the DECLARE_FLEX_ARRAY() helper macro. This fixes warnings such as: fs/hpfs/anode.c: In function 'hpfs_add_sector_to_btree': fs/hpfs/anode.c:209:27: warning: array subscript 0 is outside the bounds of an interior zero-length array 'struct bplus_internal_node[0]' [-Wzero-length-bounds] 209 | anode->btree.u.internal[0].down = cpu_to_le32(a); | ~~~~~~~~~~~~~~~~~~~~~~~^~~ In file included from fs/hpfs/hpfs_fn.h:26, from fs/hpfs/anode.c:10: fs/hpfs/hpfs.h:412:32: note: while referencing 'internal' 412 | struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving | ^~~~~~~~ drivers/net/can/usb/etas_es58x/es58x_fd.c: In function 'es58x_fd_tx_can_msg': drivers/net/can/usb/etas_es58x/es58x_fd.c:360:35: warning: array subscript 65535 is outside the bounds of an interior zero-length array 'u8[0]' {aka 'unsigned char[]'} [-Wzero-length-bounds] 360 | tx_can_msg = (typeof(tx_can_msg))&es58x_fd_urb_cmd->raw_msg[msg_len]; | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/net/can/usb/etas_es58x/es58x_core.h:22, from drivers/net/can/usb/etas_es58x/es58x_fd.c:17: drivers/net/can/usb/etas_es58x/es58x_fd.h:231:6: note: while referencing 'raw_msg' 231 | u8 raw_msg[0]; | ^~~~~~~ Cc: "Gustavo A. R. Silva" Cc: Arnd Bergmann Cc: Ayush Sawal Cc: Vinay Kumar Yadav Cc: Rohit Maheshwari Cc: Herbert Xu Cc: "David S. Miller" Cc: Kalle Valo Cc: Jakub Kicinski Cc: Stanislaw Gruszka Cc: Luca Coelho Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Johannes Berg Cc: Mordechay Goodstein Cc: Lee Jones Cc: Wolfgang Grandegger Cc: Marc Kleine-Budde Cc: Arunachalam Santhanam Cc: Vincent Mailhol Cc: Mikulas Patocka Cc: linux-crypto@vger.kernel.org Cc: ath10k@lists.infradead.org Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-can@vger.kernel.org Cc: bpf@vger.kernel.org Acked-by: Marc Kleine-Budde # drivers/net/can/usb/etas_es58x/* Signed-off-by: Kees Cook --- include/linux/filter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4a93c12543ee..4298c5e428a3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -586,8 +586,10 @@ struct bpf_prog { struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ - struct sock_filter insns[0]; - struct bpf_insn insnsi[]; + union { + DECLARE_FLEX_ARRAY(struct sock_filter, insns); + DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi); + }; }; struct sk_filter { -- cgit v1.2.3 From 47c662486cccf03e7062139d069b07ab0126ef59 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 13 Aug 2021 12:19:24 -0700 Subject: treewide: Replace 0-element memcpy() destinations with flexible arrays The 0-element arrays that are used as memcpy() destinations are actually flexible arrays. Adjust their structures accordingly so that memcpy() can better reason able their destination size (i.e. they need to be seen as "unknown" length rather than "zero"). In some cases, use of the DECLARE_FLEX_ARRAY() helper is needed when a flexible array is alone in a struct. Cc: "Gustavo A. R. Silva" Cc: Arnd Bergmann Cc: Kalle Valo Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Nilesh Javali Cc: Manish Rangankar Cc: GR-QLogic-Storage-Upstream@marvell.com Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: Larry Finger Cc: Phillip Potter Cc: Greg Kroah-Hartman Cc: Florian Schilhabel Cc: Johannes Berg Cc: Christophe JAILLET Cc: Fabio Aiuto Cc: Ross Schmidt Cc: Marco Cesati Cc: ath10k@lists.infradead.org Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-staging@lists.linux.dev Signed-off-by: Kees Cook --- include/linux/ieee80211.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 694264503119..ada3dd79cd08 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1143,7 +1143,7 @@ struct ieee80211_mgmt { __le16 auth_transaction; __le16 status_code; /* possibly followed by Challenge text */ - u8 variable[0]; + u8 variable[]; } __packed auth; struct { __le16 reason_code; @@ -1152,26 +1152,26 @@ struct ieee80211_mgmt { __le16 capab_info; __le16 listen_interval; /* followed by SSID and Supported rates */ - u8 variable[0]; + u8 variable[]; } __packed assoc_req; struct { __le16 capab_info; __le16 status_code; __le16 aid; /* followed by Supported rates */ - u8 variable[0]; + u8 variable[]; } __packed assoc_resp, reassoc_resp; struct { __le16 capab_info; __le16 status_code; - u8 variable[0]; + u8 variable[]; } __packed s1g_assoc_resp, s1g_reassoc_resp; struct { __le16 capab_info; __le16 listen_interval; u8 current_ap[ETH_ALEN]; /* followed by SSID and Supported rates */ - u8 variable[0]; + u8 variable[]; } __packed reassoc_req; struct { __le16 reason_code; @@ -1182,11 +1182,11 @@ struct ieee80211_mgmt { __le16 capab_info; /* followed by some of SSID, Supported rates, * FH Params, DS Params, CF Params, IBSS Params, TIM */ - u8 variable[0]; + u8 variable[]; } __packed beacon; struct { /* only variable items: SSID, Supported rates */ - u8 variable[0]; + DECLARE_FLEX_ARRAY(u8, variable); } __packed probe_req; struct { __le64 timestamp; @@ -1194,7 +1194,7 @@ struct ieee80211_mgmt { __le16 capab_info; /* followed by some of SSID, Supported rates, * FH Params, DS Params, CF Params, IBSS Params */ - u8 variable[0]; + u8 variable[]; } __packed probe_resp; struct { u8 category; @@ -1203,16 +1203,16 @@ struct ieee80211_mgmt { u8 action_code; u8 dialog_token; u8 status_code; - u8 variable[0]; + u8 variable[]; } __packed wme_action; struct{ u8 action_code; - u8 variable[0]; + u8 variable[]; } __packed chan_switch; struct{ u8 action_code; struct ieee80211_ext_chansw_ie data; - u8 variable[0]; + u8 variable[]; } __packed ext_chan_switch; struct{ u8 action_code; @@ -1228,7 +1228,7 @@ struct ieee80211_mgmt { __le16 timeout; __le16 start_seq_num; /* followed by BA Extension */ - u8 variable[0]; + u8 variable[]; } __packed addba_req; struct{ u8 action_code; @@ -1244,11 +1244,11 @@ struct ieee80211_mgmt { } __packed delba; struct { u8 action_code; - u8 variable[0]; + u8 variable[]; } __packed self_prot; struct{ u8 action_code; - u8 variable[0]; + u8 variable[]; } __packed mesh_action; struct { u8 action; @@ -1292,7 +1292,7 @@ struct ieee80211_mgmt { u8 toa[6]; __le16 tod_error; __le16 toa_error; - u8 variable[0]; + u8 variable[]; } __packed ftm; struct { u8 action_code; -- cgit v1.2.3 From afd7de03c5268f74202c1dd4780a8532a11f4c6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2021 08:53:19 -0600 Subject: block: remove some blk_mq_hw_ctx debugfs entries Just like the blk_mq_ctx counterparts, we've got a bunch of counters in here that are only for debugfs and are of questionnable value. 
They are: - dispatched, index of how many requests were dispatched in one go - poll_{considered,invoked,success}, which track poll success rates. We're confident in the iopoll implementation at this point, don't bother tracking these. As a bonus, this shrinks each hardware queue from 576 bytes to 512 bytes, dropping a whole cacheline. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 95c3bd3a008e..9fb8618fb957 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -341,9 +341,6 @@ struct blk_mq_hw_ctx { unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; -#define BLK_MQ_MAX_DISPATCH_ORDER 7 - /** @dispatched: Number of dispatch requests by queue. */ - unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; @@ -363,13 +360,6 @@ struct blk_mq_hw_ctx { /** @kobj: Kernel object for sysfs. */ struct kobject kobj; - /** @poll_considered: Count times blk_mq_poll() was called. */ - unsigned long poll_considered; - /** @poll_invoked: Count how many requests blk_mq_poll() polled. */ - unsigned long poll_invoked; - /** @poll_success: Count how many polled requests were completed. */ - unsigned long poll_success; - #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named -- cgit v1.2.3 From 013a7f95438144f4ab39a1017a0bff2765d2551a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Oct 2021 07:58:52 -0600 Subject: block: provide helpers for rq_list manipulation Instead of open-coding the list additions, traversal, and removal, provide a basic set of helpers. Suggested-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5b21fc8f49e..b0a322172965 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1298,4 +1298,33 @@ int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +#define rq_list_add(listptr, rq) do { \ + (rq)->rq_next = *(listptr); \ + *(listptr) = rq; \ +} while (0) + +#define rq_list_pop(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) { \ + __req = *(listptr); \ + *(listptr) = __req->rq_next; \ + } \ + __req; \ +}) + +#define rq_list_peek(listptr) \ +({ \ + struct request *__req = NULL; \ + if ((listptr) && *(listptr)) \ + __req = *(listptr); \ + __req; \ +}) + +#define rq_list_for_each(listptr, pos) \ + for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) \ + +#define rq_list_next(rq) (rq)->rq_next +#define rq_list_empty(list) ((list) == (struct request *) NULL) + #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 5a72e899ceb465d731c413d57c6c12cdbf88303c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Oct 2021 09:24:29 -0600 Subject: block: add a struct io_comp_batch argument to fops->iopoll() struct io_comp_batch contains a list head and a completion handler, which will allow batches of IO completions to be handled more efficiently. For now, no functional changes in this patch, we just define the io_comp_batch structure and add the argument to the file_operations iopoll handler.
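To give a sense of how this is intended to be used once drivers opt in (an illustrative sketch only; nothing in this patch wires it up, and the kiocb/poll_flags names here are assumptions), a polling consumer would gather completions through the batch and then run the handler once:

	DEFINE_IO_COMP_BATCH(iob);

	/* ->iopoll() may stash finished requests in iob.req_list
	 * (via the rq_list helpers above) instead of completing
	 * them one at a time.
	 */
	ret = filp->f_op->iopoll(kiocb, &iob, poll_flags);

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);	/* one call completes the whole batch */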
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 13 +++++++++++-- include/linux/fs.h | 4 +++- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9fb8618fb957..4c79439af2f2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -532,7 +532,7 @@ struct blk_mq_ops { /** * @poll: Called to poll for completion of a specific tag. */ - int (*poll)(struct blk_mq_hw_ctx *); + int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b0a322172965..fd9771a1da09 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -569,8 +569,9 @@ blk_status_t errno_to_blk_status(int errno); #define BLK_POLL_ONESHOT (1 << 0) /* do not sleep to wait for the expected completion time */ #define BLK_POLL_NOSLEEP (1 << 1) -int bio_poll(struct bio *bio, unsigned int flags); -int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1298,6 +1299,14 @@ int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +struct io_comp_batch { + struct request *req_list; + bool need_ts; + void (*complete)(struct io_comp_batch *); +}; + +#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } + #define rq_list_add(listptr, rq) do { \ (rq)->rq_next = *(listptr); \ *(listptr) = rq; \ diff --git a/include/linux/fs.h b/include/linux/fs.h index f595f4097cb7..31029a91f440 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -48,6 +48,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; +struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; @@ -2071,7 +2072,8 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, unsigned int flags); + int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, + unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); -- cgit v1.2.3 From 1aec5e4a2962f7e0b3fb3e7308dd726be2472c26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Oct 2021 05:44:23 -0600 Subject: sbitmap: add helper to clear a batch of tags sbitmap currently only supports clearing tags one-by-one, add a helper that allows the caller to pass in an array of tags to clear. 
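The win is amortizing the read-modify-write on the bitmap word across the batch. A minimal user-space model of the idea (hypothetical code, reduced to a single word for brevity; the real implementation uses an atomic andnot per word):

	#include <stdio.h>

	/* Build one mask for the whole batch, then clear it in one store. */
	static void clear_batch(unsigned long *word, const int *tags, int nr_tags)
	{
		unsigned long mask = 0;
		int i;

		for (i = 0; i < nr_tags; i++)
			mask |= 1UL << tags[i];
		*word &= ~mask;
	}

	int main(void)
	{
		unsigned long word = 0xff;
		int tags[] = { 0, 2, 5 };

		clear_batch(&word, tags, 3);
		printf("%#lx\n", word);	/* prints 0xda */
		return 0;
	}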
Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index e30b56023ead..4a6ff274335a 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); +/** + * sbitmap_queue_clear_batch() - Free a batch of allocated bits + * &struct sbitmap_queue. + * @sbq: Bitmap to free from. + * @offset: offset for each tag in array + * @tags: array of tags + * @nr_tags: number of tags in array + */ +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, + int *tags, int nr_tags); + static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); -- cgit v1.2.3 From f794f3351f2672d782b8df0fa59f3cef38cffa59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 8 Oct 2021 05:50:46 -0600 Subject: block: add support for blk_mq_end_request_batch() Instead of calling blk_mq_end_request() on a single request, add a helper that takes the new struct io_comp_batch and completes any request stored in there. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4c79439af2f2..656fe34bdb6c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -728,6 +728,35 @@ static inline void blk_mq_set_request_complete(struct request *rq) void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); +void blk_mq_end_request_batch(struct io_comp_batch *ib); + +/* + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV)); +} + +/* + * Batched completions only work when there is no I/O error and no special + * ->end_io handler. + */ +static inline bool blk_mq_add_to_batch(struct request *req, + struct io_comp_batch *iob, int ioerror, + void (*complete)(struct io_comp_batch *)) +{ + if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror) + return false; + if (!iob->complete) + iob->complete = complete; + else if (iob->complete != complete) + return false; + iob->need_ts |= blk_mq_need_time_stamp(req); + rq_list_add(&iob->req_list, req); + return true; +} void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); -- cgit v1.2.3 From 99457db8b40c66941072a383a5ab4e36bc53fd3d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 12:11:01 +0200 Subject: block: move the SECTOR_SIZE related definitions to blk_types.h Ensure these are always available for inlines in the various block layer headers. 
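For reference, the definitions being moved are pure compile-time arithmetic; with a 4K PAGE_SIZE they work out to:

	SECTOR_SIZE  = 1 << SECTOR_SHIFT                 = 1 << 9 = 512
	PAGE_SECTORS = 1 << (PAGE_SHIFT - SECTOR_SHIFT)  = 4096 / 512 = 8
	SECTOR_MASK  = PAGE_SECTORS - 1                  = 7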
Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211018101130.1838532-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 17 +++++++++++++++++ include/linux/blkdev.h | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e370929c89e..472e55e0e94f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,23 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; +/* + * The basic unit of block I/O is a sector. It is used in a number of contexts + * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 + * bytes. Variables of type sector_t represent an offset or size that is a + * multiple of 512 bytes. Hence these two constants. + */ +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif +#ifndef SECTOR_SIZE +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#endif + +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + struct block_device { sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fd9771a1da09..abe721591e80 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -578,23 +578,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_queue; /* this is never NULL */ } -/* - * The basic unit of block I/O is a sector. It is used in a number of contexts - * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 - * bytes. Variables of type sector_t represent an offset or size that is a - * multiple of 512 bytes. Hence these two constants. - */ -#ifndef SECTOR_SHIFT -#define SECTOR_SHIFT 9 -#endif -#ifndef SECTOR_SIZE -#define SECTOR_SIZE (1 << SECTOR_SHIFT) -#endif - -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - #ifdef CONFIG_BLK_DEV_ZONED /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ -- cgit v1.2.3 From 6436bd90f76e75d2c5786a50203b05a9b7f7100d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 12:11:02 +0200 Subject: block: add a bdev_nr_bytes helper Add a helper to query the size of a block device in bytes. This will be used to remove open coded access to ->bd_inode. 
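A typical caller conversion enabled by this helper would look roughly like the following (sketch; the surrounding caller is hypothetical):

	-	loff_t size = i_size_read(bdev->bd_inode);
	+	loff_t size = bdev_nr_bytes(bdev);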
Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20211018101130.1838532-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd4038fd5743..01d27f3a970e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -236,9 +236,14 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_start_sect; } +static inline loff_t bdev_nr_bytes(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode); +} + static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return i_size_read(bdev->bd_inode) >> 9; + return bdev_nr_bytes(bdev) >> SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) -- cgit v1.2.3 From bcc6e2cfaa48a4ad2e17719194f6d97a8e03f6c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 12:11:25 +0200 Subject: block: add a sb_bdev_nr_blocks helper Add a helper to return the size of sb->s_bdev in sb->s_blocksize_bits based units. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211018101130.1838532-26-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 01d27f3a970e..7b0326661a1e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -251,6 +251,12 @@ static inline sector_t get_capacity(struct gendisk *disk) { return bdev_nr_sectors(disk->part0); } +static inline u64 sb_bdev_nr_blocks(struct super_block *sb) +{ + return bdev_nr_sectors(sb->s_bdev) >> + (sb->s_blocksize_bits - SECTOR_SHIFT); +} + int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -- cgit v1.2.3 From f09313c57a17683cbcb305989daf1d94b49fd32c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2021 11:39:45 -0600 Subject: block: cache inode size in bdev Reading the inode size brings in a new cacheline for IO submit, and it's in the hot path being checked for every single IO. When doing millions of IOs per core per second, this is noticeable overhead. Cache the nr_sectors in the bdev itself.
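The cached value has to be kept coherent with the backing inode whenever the capacity changes; conceptually the update site looks like this (sketch only, assuming a single helper through which all capacity updates funnel):

	static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
	{
		/* keep bd_nr_sectors in sync with the inode size */
		bdev->bd_nr_sectors = sectors;
		i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
	}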
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + include/linux/genhd.h | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 472e55e0e94f..fe065c394fff 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,6 +39,7 @@ struct bio_crypt_ctx; struct block_device { sector_t bd_start_sect; + sector_t bd_nr_sectors; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; bool bd_read_only; /* read-only policy */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7b0326661a1e..900325aa5d31 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -236,14 +236,14 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_start_sect; } -static inline loff_t bdev_nr_bytes(struct block_device *bdev) +static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return i_size_read(bdev->bd_inode); + return bdev->bd_nr_sectors; } -static inline sector_t bdev_nr_sectors(struct block_device *bdev) +static inline loff_t bdev_nr_bytes(struct block_device *bdev) { - return bdev_nr_bytes(bdev) >> SECTOR_SHIFT; + return bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) -- cgit v1.2.3 From 15bc01effefe97757ef02ca09e9d1b927ab22725 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 16 Oct 2021 15:59:49 -0500 Subject: ucounts: Fix signal ucount refcounting In commit fda31c50292a ("signal: avoid double atomic counter increments for user accounting") Linus made a clever optimization to how rlimits and the struct user_struct interact. Unfortunately that optimization does not work in the obvious way when moved to nested rlimits. The problem is that the last decrement of the per user namespace per user sigpending counter might also be the last decrement of the sigpending counter in the parent user namespace as well. Which means that simply freeing the leaf ucount in __free_sigqueue is not enough. Maintain the optimization and handle the tricky cases by introducing inc_rlimit_get_ucounts and dec_rlimit_put_ucounts. By moving the entire optimization into functions that perform all of the work it becomes possible to ensure that every level is handled properly. The new function inc_rlimit_get_ucounts returns 0 on failure to increment the ucount. This is different than inc_rlimit_ucounts which increments the ucounts and returns LONG_MAX if the ucount counter has exceeded its maximum or it wrapped (to indicate the counter needs to be decremented). I wish we had a single user to account all pending signals to across all of the threads of a process so this complexity was not necessary. Cc: stable@vger.kernel.org Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") v1: https://lkml.kernel.org/r/87mtnavszx.fsf_-_@disp2133 Link: https://lkml.kernel.org/r/87fssytizw.fsf_-_@disp2133 Reviewed-by: Alexey Gladkov Tested-by: Rune Kleveland Tested-by: Yu Zhao Tested-by: Jordan Glover Signed-off-by: "Eric W.
Biederman" --- include/linux/user_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index eb70cabe6e7f..33a4240e6a6f 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -127,6 +127,8 @@ static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type t long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type); +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type); bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max); static inline void set_rlimit_ucount_max(struct user_namespace *ns, -- cgit v1.2.3 From ed65df63a39a3f6ed04f7258de8b6789e5021c18 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Oct 2021 15:44:12 -0400 Subject: tracing: Have all levels of checks prevent recursion While writing an email explaining the "bit = 0" logic for a discussion on making ftrace_test_recursion_trylock() disable preemption, I discovered a path that makes the "not do the logic if bit is zero" unsafe. The recursion logic is done in hot paths like the function tracer. Thus, any code executed causes noticeable overhead. Thus, tricks are done to try to limit the amount of code executed. This included the recursion testing logic. Having recursion testing is important, as there are many paths that can end up in an infinite recursion cycle when tracing every function in the kernel. Thus protection is needed to prevent that from happening. Because it is OK to recurse due to different running context levels (e.g. an interrupt preempts a trace, and then a trace occurs in the interrupt handler), a set of bits are used to know which context one is in (normal, softirq, irq and NMI). If a recursion occurs in the same level, it is prevented*. Then there are infrastructure levels of recursion as well. When more than one callback is attached to the same function to trace, it calls a loop function to iterate over all the callbacks. Both the callbacks and the loop function have recursion protection. The callbacks use the "ftrace_test_recursion_trylock()" which has a "function" set of context bits to test, and the loop function calls the internal trace_test_and_set_recursion() directly, with an "internal" set of bits. If an architecture does not implement all the features supported by ftrace then the callbacks are never called directly, and the loop function is called instead, which will implement the features of ftrace. Since both the loop function and the callbacks do recursion protection, it was seemed unnecessary to do it in both locations. Thus, a trick was made to have the internal set of recursion bits at a more significant bit location than the function bits. Then, if any of the higher bits were set, the logic of the function bits could be skipped, as any new recursion would first have to go through the loop function. This is true for architectures that do not support all the ftrace features, because all functions being traced must first go through the loop function before going to the callbacks. But this is not true for architectures that support all the ftrace features. 
That's because the loop function could be called due to two callbacks attached to the same function, but then a recursion function inside the callback could be called that does not share any other callback, and it will be called directly. i.e. traced_function_1: [ more than one callback tracing it ] call loop_func loop_func: trace_recursion set internal bit call callback callback: trace_recursion [ skipped because internal bit is set, return 0 ] call traced_function_2 traced_function_2: [ only traced by above callback ] call callback callback: trace_recursion [ skipped because internal bit is set, return 0 ] call traced_function_2 [ wash, rinse, repeat, BOOM! out of shampoo! ] Thus, the "bit == 0 skip" trick is not safe, unless the loop function is called for all functions. Since we want to encourage architectures to implement all ftrace features, having them slow down due to this extra logic may encourage the maintainers to update to the latest ftrace features. And because this logic is only safe for them, remove it completely. [*] There is one layer of recursion that is allowed, and that is to allow for the transition between interrupt context (normal -> softirq -> irq -> NMI), because a trace may occur before the context update is visible to the trace recursion logic. Link: https://lore.kernel.org/all/609b565a-ed6e-a1da-f025-166691b5d994@linux.alibaba.com/ Link: https://lkml.kernel.org/r/20211018154412.09fcad3c@gandalf.local.home Cc: Linus Torvalds Cc: Petr Mladek Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Joe Lawrence Cc: Colin Ian King Cc: Masami Hiramatsu Cc: "Peter Zijlstra (Intel)" Cc: Nicholas Piggin Cc: Jisheng Zhang Cc: 王贇 Cc: Guo Ren Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 49 ++++++++--------------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..fe95f0922526 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -16,23 +16,8 @@ * When function tracing occurs, the following steps are made: * If arch does not support a ftrace feature: * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... * The function callback, which can use the FTRACE bits to * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check.
*/ enum { /* Function recursion bits */ @@ -40,12 +25,14 @@ enum { TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, + TRACE_FTRACE_TRANSITION_BIT, - /* INTERNAL_BITs must be greater than FTRACE_BITs */ + /* Internal use recursion bits */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, + TRACE_INTERNAL_TRANSITION_BIT, TRACE_BRANCH_BIT, /* @@ -86,12 +73,6 @@ enum { */ TRACE_GRAPH_NOTRACE_BIT, - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, - /* Used to prevent recursion recording from recursing. */ TRACE_RECORD_RECURSION_BIT, }; @@ -113,12 +94,10 @@ enum { #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) #define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX +#define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) /* * Used for setting context @@ -132,6 +111,7 @@ enum { TRACE_CTX_IRQ, TRACE_CTX_SOFTIRQ, TRACE_CTX_NORMAL, + TRACE_CTX_TRANSITION, }; static __always_inline int trace_get_context_bit(void) @@ -160,45 +140,34 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); #endif static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, - int start, int max) + int start) { unsigned int val = READ_ONCE(current->trace_recursion); int bit; - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* * It could be that preempt_count has not been updated during * a switch between contexts. Allow for a single recursion. */ - bit = TRACE_TRANSITION_BIT; + bit = TRACE_CTX_TRANSITION + start; if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - } else { - /* Normal check passed, clear the transition to allow it again */ - val &= ~(1 << TRACE_TRANSITION_BIT); } val |= 1 << bit; current->trace_recursion = val; barrier(); - return bit + 1; + return bit; } static __always_inline void trace_clear_recursion(int bit) { - if (!bit) - return; - barrier(); - bit--; trace_recursion_clear(bit); } @@ -214,7 +183,7 @@ static __always_inline void trace_clear_recursion(int bit) static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, unsigned long parent_ip) { - return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START); } /** -- cgit v1.2.3 From 425a563acb1d1872c0036fbcb644247640edbbc6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 23 May 2021 13:34:28 +0300 Subject: net/mlx5: Introduce port selection namespace Add new port selection flow steering namespace. Flow steering rules in this namespace are used to determine the physical port for egress packets.
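As an illustration (not part of this patch), a consumer would resolve the new namespace like any other steering namespace before creating flow tables in it:

	struct mlx5_flow_namespace *ns;

	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_PORT_SEL);
	if (!ns)
		return -EOPNOTSUPP;	/* device lacks port selection support */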
Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 25 +++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 109cc8106d16..347167c18802 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1185,6 +1185,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_GENERAL_2 = 0x20, + MLX5_CAP_PORT_SELECTION = 0x25, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1342,6 +1343,20 @@ enum mlx5_qcam_feature_groups { MLX5_GET(e_switch_cap, \ mdev->caps.hca[MLX5_CAP_ESWITCH]->max, cap) +#define MLX5_CAP_PORT_SELECTION(mdev, cap) \ + MLX5_GET(port_selection_cap, \ + mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->cur, cap) + +#define MLX5_CAP_PORT_SELECTION_MAX(mdev, cap) \ + MLX5_GET(port_selection_cap, \ + mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->max, cap) + +#define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \ + MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap) + +#define MLX5_CAP_FLOWTABLE_PORT_SELECTION_MAX(mdev, cap) \ + MLX5_CAP_PORT_SELECTION_MAX(mdev, flow_table_properties_port_selection.cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 0106c67e8ccb..259fcc168340 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -83,6 +83,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, + MLX5_FLOW_NAMESPACE_PORT_SEL, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c614ad1da44d..db1d9c69c1fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -767,6 +767,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 reserved_at_20c0[0x5f40]; }; +struct mlx5_ifc_port_selection_cap_bits { + u8 reserved_at_0[0x10]; + u8 port_select_flow_table[0x1]; + u8 reserved_at_11[0xf]; + + u8 reserved_at_20[0x1e0]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_port_selection; + + u8 reserved_at_400[0x7c00]; +}; + enum { MLX5_FDB_TO_VPORT_REG_C_0 = 0x01, MLX5_FDB_TO_VPORT_REG_C_1 = 0x02, @@ -1515,7 +1527,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 uar_4k[0x1]; u8 reserved_at_241[0x9]; u8 uar_sz[0x6]; - u8 reserved_at_248[0x2]; + u8 port_selection_cap[0x1]; + u8 reserved_at_248[0x1]; u8 umem_uid_0[0x1]; u8 reserved_at_250[0x5]; u8 log_pg_sz[0x8]; @@ -3164,6 +3177,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; struct mlx5_ifc_e_switch_cap_bits e_switch_cap; + struct mlx5_ifc_port_selection_cap_bits port_selection_cap; struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap; struct mlx5_ifc_qos_cap_bits qos_cap; struct mlx5_ifc_debug_cap_bits debug_cap; @@ -10434,9 +10448,16 @@ struct mlx5_ifc_dcbx_param_bits { u8 reserved_at_a0[0x160]; }; +enum { + MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY = 0, + MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT, +}; + struct mlx5_ifc_lagc_bits { u8 fdb_selection_mode[0x1]; - u8 reserved_at_1[0x1c]; + u8 reserved_at_1[0x14]; + u8 port_select_mode[0x3]; + u8 reserved_at_18[0x5]; u8 lag_state[0x3]; u8 reserved_at_20[0x14]; -- cgit v1.2.3 From 
e7e2519e3632396a25031b7e828ed35332e5dd07 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 6 Jul 2021 17:48:26 +0300 Subject: net/mlx5: Add support to create match definer Introduce new APIs to create and destroy flow matcher for given format id. Flow match definer object is used for defining the fields and mask used for the hash calculation. User should mask the desired fields like done in the match criteria. This object is assigned to flow group of type hash. In this flow group type, packets lookup is done based on the hash result. This patch also adds the required bits to create such flow group. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 8 ++ include/linux/mlx5/mlx5_ifc.h | 272 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 253 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 259fcc168340..7a43fec63a35 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -98,6 +98,7 @@ enum { struct mlx5_pkt_reformat; struct mlx5_modify_hdr; +struct mlx5_flow_definer; struct mlx5_flow_table; struct mlx5_flow_group; struct mlx5_flow_namespace; @@ -258,6 +259,13 @@ struct mlx5_modify_hdr *mlx5_modify_header_alloc(struct mlx5_core_dev *dev, void *modify_actions); void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, struct mlx5_modify_hdr *modify_hdr); +struct mlx5_flow_definer * +mlx5_create_match_definer(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type ns_type, u16 format_id, + u32 *match_mask); +void mlx5_destroy_match_definer(struct mlx5_core_dev *dev, + struct mlx5_flow_definer *definer); +int mlx5_get_match_definer_id(struct mlx5_flow_definer *definer); struct mlx5_pkt_reformat_params { int type; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index db1d9c69c1fa..8f41145bc6ef 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -94,6 +94,7 @@ enum { enum { MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b, MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, + MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, MLX5_OBJ_TYPE_PSV = 0xff03, @@ -1731,7 +1732,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 flex_parser_id_outer_first_mpls_over_gre[0x4]; u8 flex_parser_id_outer_first_mpls_over_udp_label[0x4]; - u8 reserved_at_6e0[0x10]; + u8 max_num_match_definer[0x10]; u8 sf_base_id[0x10]; u8 flex_parser_id_gtpu_dw_2[0x4]; @@ -1746,7 +1747,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_760[0x20]; u8 vhca_tunnel_commands[0x40]; - u8 reserved_at_7c0[0x40]; + u8 match_definer_format_supported[0x40]; }; struct mlx5_ifc_cmd_hca_cap_2_bits { @@ -5666,6 +5667,236 @@ struct mlx5_ifc_query_fte_in_bits { u8 reserved_at_120[0xe0]; }; +struct mlx5_ifc_match_definer_format_0_bits { + u8 reserved_at_0[0x100]; + + u8 metadata_reg_c_0[0x20]; + + u8 metadata_reg_c_1[0x20]; + + u8 outer_dmac_47_16[0x20]; + + u8 outer_dmac_15_0[0x10]; + u8 outer_ethertype[0x10]; + + u8 reserved_at_180[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 outer_ip_frag[0x1]; + u8 outer_qp_type[0x2]; + u8 outer_encap_type[0x2]; + u8 port_number[0x2]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 outer_first_vlan_prio[0x3]; + u8 outer_first_vlan_cfi[0x1]; + u8 outer_first_vlan_vid[0xc]; + + u8 outer_l4_type_ext[0x4]; + u8 reserved_at_1a4[0x2]; + u8 outer_ipsec_layer[0x2]; + u8 outer_l2_type[0x2]; + u8 force_lb[0x1]; + u8 
outer_l2_ok[0x1]; + u8 outer_l3_ok[0x1]; + u8 outer_l4_ok[0x1]; + u8 outer_second_vlan_type[0x2]; + u8 outer_second_vlan_prio[0x3]; + u8 outer_second_vlan_cfi[0x1]; + u8 outer_second_vlan_vid[0xc]; + + u8 outer_smac_47_16[0x20]; + + u8 outer_smac_15_0[0x10]; + u8 inner_ipv4_checksum_ok[0x1]; + u8 inner_l4_checksum_ok[0x1]; + u8 outer_ipv4_checksum_ok[0x1]; + u8 outer_l4_checksum_ok[0x1]; + u8 inner_l3_ok[0x1]; + u8 inner_l4_ok[0x1]; + u8 outer_l3_ok_duplicate[0x1]; + u8 outer_l4_ok_duplicate[0x1]; + u8 outer_tcp_cwr[0x1]; + u8 outer_tcp_ece[0x1]; + u8 outer_tcp_urg[0x1]; + u8 outer_tcp_ack[0x1]; + u8 outer_tcp_psh[0x1]; + u8 outer_tcp_rst[0x1]; + u8 outer_tcp_syn[0x1]; + u8 outer_tcp_fin[0x1]; +}; + +struct mlx5_ifc_match_definer_format_22_bits { + u8 reserved_at_0[0x100]; + + u8 outer_ip_src_addr[0x20]; + + u8 outer_ip_dest_addr[0x20]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 reserved_at_160[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 outer_ip_frag[0x1]; + u8 outer_qp_type[0x2]; + u8 outer_encap_type[0x2]; + u8 port_number[0x2]; + u8 outer_l3_type[0x2]; + u8 outer_l4_type[0x2]; + u8 outer_first_vlan_type[0x2]; + u8 outer_first_vlan_prio[0x3]; + u8 outer_first_vlan_cfi[0x1]; + u8 outer_first_vlan_vid[0xc]; + + u8 metadata_reg_c_0[0x20]; + + u8 outer_dmac_47_16[0x20]; + + u8 outer_smac_47_16[0x20]; + + u8 outer_smac_15_0[0x10]; + u8 outer_dmac_15_0[0x10]; +}; + +struct mlx5_ifc_match_definer_format_23_bits { + u8 reserved_at_0[0x100]; + + u8 inner_ip_src_addr[0x20]; + + u8 inner_ip_dest_addr[0x20]; + + u8 inner_l4_sport[0x10]; + u8 inner_l4_dport[0x10]; + + u8 reserved_at_160[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 inner_ip_frag[0x1]; + u8 inner_qp_type[0x2]; + u8 inner_encap_type[0x2]; + u8 port_number[0x2]; + u8 inner_l3_type[0x2]; + u8 inner_l4_type[0x2]; + u8 inner_first_vlan_type[0x2]; + u8 inner_first_vlan_prio[0x3]; + u8 inner_first_vlan_cfi[0x1]; + u8 inner_first_vlan_vid[0xc]; + + u8 tunnel_header_0[0x20]; + + u8 inner_dmac_47_16[0x20]; + + u8 inner_smac_47_16[0x20]; + + u8 inner_smac_15_0[0x10]; + u8 inner_dmac_15_0[0x10]; +}; + +struct mlx5_ifc_match_definer_format_29_bits { + u8 reserved_at_0[0xc0]; + + u8 outer_ip_dest_addr[0x80]; + + u8 outer_ip_src_addr[0x80]; + + u8 outer_l4_sport[0x10]; + u8 outer_l4_dport[0x10]; + + u8 reserved_at_1e0[0x20]; +}; + +struct mlx5_ifc_match_definer_format_30_bits { + u8 reserved_at_0[0xa0]; + + u8 outer_ip_dest_addr[0x80]; + + u8 outer_ip_src_addr[0x80]; + + u8 outer_dmac_47_16[0x20]; + + u8 outer_smac_47_16[0x20]; + + u8 outer_smac_15_0[0x10]; + u8 outer_dmac_15_0[0x10]; +}; + +struct mlx5_ifc_match_definer_format_31_bits { + u8 reserved_at_0[0xc0]; + + u8 inner_ip_dest_addr[0x80]; + + u8 inner_ip_src_addr[0x80]; + + u8 inner_l4_sport[0x10]; + u8 inner_l4_dport[0x10]; + + u8 reserved_at_1e0[0x20]; +}; + +struct mlx5_ifc_match_definer_format_32_bits { + u8 reserved_at_0[0xa0]; + + u8 inner_ip_dest_addr[0x80]; + + u8 inner_ip_src_addr[0x80]; + + u8 inner_dmac_47_16[0x20]; + + u8 inner_smac_47_16[0x20]; + + u8 inner_smac_15_0[0x10]; + u8 inner_dmac_15_0[0x10]; +}; + +struct mlx5_ifc_match_definer_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x40]; + + u8 reserved_at_80[0x10]; + u8 format_id[0x10]; + + u8 reserved_at_a0[0x160]; + + u8 match_mask[16][0x20]; +}; + +struct mlx5_ifc_general_obj_in_cmd_hdr_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 vhca_tunnel_id[0x10]; + u8 obj_type[0x10]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct 
mlx5_ifc_general_obj_out_cmd_hdr_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 obj_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_match_definer_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + + struct mlx5_ifc_match_definer_bits obj_context; +}; + +struct mlx5_ifc_create_match_definer_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; +}; + enum { MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, @@ -8139,6 +8370,11 @@ struct mlx5_ifc_create_flow_group_out_bits { u8 reserved_at_60[0x20]; }; +enum { + MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_TCAM_SUBTABLE = 0x0, + MLX5_CREATE_FLOW_GROUP_IN_GROUP_TYPE_HASH_SPLIT = 0x1, +}; + enum { MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, @@ -8160,7 +8396,9 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x4]; + u8 group_type[0x4]; + u8 reserved_at_90[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8175,7 +8413,10 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 end_flow_index[0x20]; - u8 reserved_at_140[0xa0]; + u8 reserved_at_140[0x10]; + u8 match_definer_id[0x10]; + + u8 reserved_at_160[0x80]; u8 reserved_at_1e0[0x18]; u8 match_criteria_enable[0x8]; @@ -10671,29 +10912,6 @@ struct mlx5_ifc_dealloc_memic_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_general_obj_in_cmd_hdr_bits { - u8 opcode[0x10]; - u8 uid[0x10]; - - u8 vhca_tunnel_id[0x10]; - u8 obj_type[0x10]; - - u8 obj_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - -struct mlx5_ifc_general_obj_out_cmd_hdr_bits { - u8 status[0x8]; - u8 reserved_at_8[0x18]; - - u8 syndrome[0x20]; - - u8 obj_id[0x20]; - - u8 reserved_at_60[0x20]; -}; - struct mlx5_ifc_umem_bits { u8 reserved_at_0[0x80]; -- cgit v1.2.3 From 58a606dba708f114e7f412b5c3024027e8a73e34 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 3 Aug 2021 10:04:41 +0300 Subject: net/mlx5: Introduce new uplink destination type The uplink destination type should be used in rules to steer the packet to the uplink when the device is in steering based LAG mode. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8f41145bc6ef..09e43019d877 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1766,6 +1766,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, + MLX5_FLOW_DESTINATION_TYPE_UPLINK = 0x8, MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, -- cgit v1.2.3 From 76af6a054da4055305ddb28c5eb151b9ee4f74f9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 18 Oct 2021 15:15:32 -0700 Subject: mm/migrate: add CPU hotplug to demotion #ifdef Once upon a time, the node demotion updates were driven solely by memory hotplug events. But now, there are handlers for both CPU and memory hotplug. However, the #ifdef around the code checks only memory hotplug. A system that has HOTPLUG_CPU=y but MEMORY_HOTPLUG=n would miss CPU hotplug events. 
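The shape of the fix, sketched here with placeholder comments rather than the literal code, is to widen the outer guard and add per-handler guards inside it:

	#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_HOTPLUG_CPU)
	/* common node demotion order code */
	#ifdef CONFIG_MEMORY_HOTPLUG
	/* memory hotplug notifier */
	#endif
	#ifdef CONFIG_HOTPLUG_CPU
	/* CPU hotplug callbacks */
	#endif
	#endif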
Update the #ifdef around the common code. Add memory and CPU-specific #ifdefs for their handlers. These memory/CPU #ifdefs avoid unused function warnings when their Kconfig option is off. [arnd@arndb.de: rework hotplug_memory_notifier() stub] Link: https://lkml.kernel.org/r/20211013144029.2154629-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20210924161255.E5FE8F7E@davehans-spike.ostc.intel.com Fixes: 884a6e5d1f93 ("mm/migrate: update node demotion order on hotplug events") Signed-off-by: Dave Hansen Signed-off-by: Arnd Bergmann Cc: "Huang, Ying" Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 7efc0a7c14c9..182c606adb06 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -160,7 +160,10 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, #define register_hotmemory_notifier(nb) register_memory_notifier(nb) #define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #else -#define hotplug_memory_notifier(fn, pri) ({ 0; }) +static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) +{ + return 0; +} /* These aren't inline functions due to a GCC bug. */ #define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) #define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) -- cgit v1.2.3 From a6a0251c6fce496744121b4e08c899f45270dbcc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 18 Oct 2021 15:15:35 -0700 Subject: mm/migrate: fix CPUHP state to update node demotion order The node demotion order needs to be updated during CPU hotplug. Because whether a NUMA node has CPU may influence the demotion order. The update function should be called during CPU online/offline after the node_states[N_CPU] has been updated. That is done in CPUHP_AP_ONLINE_DYN during CPU online and in CPUHP_MM_VMSTAT_DEAD during CPU offline. But in commit 884a6e5d1f93 ("mm/migrate: update node demotion order on hotplug events"), the function to update node demotion order is called in CPUHP_AP_ONLINE_DYN during CPU online/offline. This doesn't satisfy the order requirement. For example, there are 4 CPUs (P0, P1, P2, P3) in 2 sockets (P0, P1 in S0 and P2, P3 in S1), the demotion order is - S0 -> NUMA_NO_NODE - S1 -> NUMA_NO_NODE After P2 and P3 is offlined, because S1 has no CPU now, the demotion order should have been changed to - S0 -> S1 - S1 -> NO_NODE but it isn't changed, because the order updating callback for CPU hotplug doesn't see the new nodemask. After that, if P1 is offlined, the demotion order is changed to the expected order as above. So in this patch, we added CPUHP_AP_MM_DEMOTION_ONLINE and CPUHP_MM_DEMOTION_DEAD to be called after CPUHP_AP_ONLINE_DYN and CPUHP_MM_VMSTAT_DEAD during CPU online and offline, and register the update function on them. 
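For illustration, registration on the new states would look roughly like this (a sketch; the init and callback names are placeholders):

	static int __init migrate_on_reclaim_init(void)
	{
		int ret;

		ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD,
						"mm/demotion:offline",
						NULL, migration_offline_cpu);
		WARN_ON(ret < 0);
		ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE,
					"mm/demotion:online",
					migration_online_cpu, NULL);
		WARN_ON(ret < 0);
		return 0;
	}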
Link: https://lkml.kernel.org/r/20210929060351.7293-1-ying.huang@intel.com Fixes: 884a6e5d1f93 ("mm/migrate: update node demotion order on hotplug events") Signed-off-by: "Huang, Ying" Cc: Thomas Gleixner Cc: Dave Hansen Cc: Yang Shi Cc: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 832d8a74fa59..991911048857 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -72,6 +72,8 @@ enum cpuhp_state { CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, + /* Must be after CPUHP_MM_VMSTAT_DEAD */ + CPUHP_MM_DEMOTION_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, @@ -240,6 +242,8 @@ enum cpuhp_state { CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */ + CPUHP_AP_MM_DEMOTION_ONLINE, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_DTPM_CPU_ONLINE, -- cgit v1.2.3 From b0e901280d9860a0a35055f220e8e457f300f40a Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 18 Oct 2021 15:16:09 -0700 Subject: elfcore: correct reference to CONFIG_UML Commit 6e7b64b9dd6d ("elfcore: fix building with clang") introduces special handling for two architectures, ia64 and User Mode Linux. However, the wrong name, i.e., CONFIG_UM, for the intended Kconfig symbol for User-Mode Linux was used. Although the directory for User Mode Linux is ./arch/um; the Kconfig symbol for this architecture is called CONFIG_UML. Luckily, ./scripts/checkkconfigsymbols.py warns on non-existing configs: UM Referencing files: include/linux/elfcore.h Similar symbols: UML, NUMA Correct the name of the config to the intended one. [akpm@linux-foundation.org: fix um/x86_64, per Catalin] Link: https://lkml.kernel.org/r/20211006181119.2851441-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/YV6pejGzLy5ppEpt@arm.com Link: https://lkml.kernel.org/r/20211006082209.417-1-lukas.bulwahn@gmail.com Fixes: 6e7b64b9dd6d ("elfcore: fix building with clang") Signed-off-by: Lukas Bulwahn Cc: Arnd Bergmann Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Barret Rhoden Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfcore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 2aaa15779d50..957ebec35aad 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -109,7 +109,7 @@ static inline int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_reg #endif } -#if defined(CONFIG_UM) || defined(CONFIG_IA64) +#if (defined(CONFIG_UML) && defined(CONFIG_X86_32)) || defined(CONFIG_IA64) /* * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out * extra segments containing the gate DSO contents. 
Dumping its -- cgit v1.2.3 From 79f9bc5843142b649575f887dccdf1c07ad75c20 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 18 Oct 2021 15:16:16 -0700 Subject: mm/secretmem: fix NULL page->mapping dereference in page_is_secretmem() Check for a NULL page->mapping before dereferencing the mapping in page_is_secretmem(), as the page's mapping can be nullified while gup() is running, e.g. by reclaim or truncation. BUG: kernel NULL pointer dereference, address: 0000000000000068 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 6 PID: 4173897 Comm: CPU 3/KVM Tainted: G W RIP: 0010:internal_get_user_pages_fast+0x621/0x9d0 Code: <48> 81 7a 68 80 08 04 bc 0f 85 21 ff ff 8 89 c7 be RSP: 0018:ffffaa90087679b0 EFLAGS: 00010046 RAX: ffffe3f37905b900 RBX: 00007f2dd561e000 RCX: ffffe3f37905b934 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffe3f37905b900 ... CR2: 0000000000000068 CR3: 00000004c5898003 CR4: 00000000001726e0 Call Trace: get_user_pages_fast_only+0x13/0x20 hva_to_pfn+0xa9/0x3e0 try_async_pf+0xa1/0x270 direct_page_fault+0x113/0xad0 kvm_mmu_page_fault+0x69/0x680 vmx_handle_exit+0xe1/0x5d0 kvm_arch_vcpu_ioctl_run+0xd81/0x1c70 kvm_vcpu_ioctl+0x267/0x670 __x64_sys_ioctl+0x83/0xa0 do_syscall_64+0x56/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Link: https://lkml.kernel.org/r/20211007231502.3552715-1-seanjc@google.com Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: Sean Christopherson Reported-by: Darrick J. Wong Reported-by: Stephen Tested-by: Darrick J. Wong Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/secretmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 21c3771e6a56..988528b5da43 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -23,7 +23,7 @@ static inline bool page_is_secretmem(struct page *page) mapping = (struct address_space *) ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); - if (mapping != page->mapping) + if (!mapping || mapping != page->mapping) return false; return mapping->a_ops == &secretmem_aops; -- cgit v1.2.3 From 25c02edfd41f0dd7aad9115149625d7e7f441b7d Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Fri, 3 Sep 2021 10:29:13 +0300 Subject: iio: inkern: introduce devm_iio_map_array_register() short-hand function This change introduces a device-managed variant of the iio_map_array_register() function. It's a simple implementation that calls iio_map_array_register() and registers a callback to iio_map_array_unregister() via devm_add_action_or_reset(). The function uses an explicit 'dev' parameter to bind the unwinding to. It could have been implemented to implicitly use the parent of the IIO device; however, it shouldn't be too expensive for callers to just specify which device object to bind this unwind call to. It also makes the API a bit more flexible.
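As a sketch of typical probe-time usage (the driver, state, and map-table names below are hypothetical, not taken from an in-tree driver):

	/* Hypothetical consumer mapping, terminated by an empty entry. */
	static struct iio_map foo_adc_maps[] = {
		{
			.consumer_dev_name = "foo-battery",
			.consumer_channel = "vbat",
			.adc_channel_label = "channel0",
		},
		{ }
	};

	static int foo_adc_probe(struct platform_device *pdev)
	{
		struct iio_dev *indio_dev;
		int ret;

		indio_dev = devm_iio_device_alloc(&pdev->dev, 0);
		if (!indio_dev)
			return -ENOMEM;

		/* unwinding is now bound to &pdev->dev; no remove() cleanup needed */
		ret = devm_iio_map_array_register(&pdev->dev, indio_dev, foo_adc_maps);
		if (ret)
			return ret;

		return devm_iio_device_register(&pdev->dev, indio_dev);
	}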
Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210903072917.45769-2-aardelean@deviqon.com Signed-off-by: Jonathan Cameron --- include/linux/iio/driver.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/driver.h b/include/linux/iio/driver.h index 36de60a5da7a..7a157ed218f6 100644 --- a/include/linux/iio/driver.h +++ b/include/linux/iio/driver.h @@ -8,6 +8,7 @@ #ifndef _IIO_INKERN_H_ #define _IIO_INKERN_H_ +struct device; struct iio_dev; struct iio_map; @@ -26,4 +27,17 @@ int iio_map_array_register(struct iio_dev *indio_dev, */ int iio_map_array_unregister(struct iio_dev *indio_dev); +/** + * devm_iio_map_array_register - device-managed version of iio_map_array_register + * @dev: Device object to which to bind the unwinding of this registration + * @indio_dev: Pointer to the iio_dev structure + * @maps: Pointer to an IIO map object which is to be registered to this IIO device + * + * This function will call iio_map_array_register() to register an IIO map object + * and will also hook a callback to the iio_map_array_unregister() function to + * handle de-registration of the IIO map object when the device's refcount goes to + * zero. + */ +int devm_iio_map_array_register(struct device *dev, struct iio_dev *indio_dev, struct iio_map *maps); + #endif -- cgit v1.2.3 From 31fa357ac809affd9f9a7d0b5d1991951e16beec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Fri, 3 Sep 2021 16:14:20 +0200 Subject: iio: adis: handle devices that cannot unmask the drdy pin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices can't mask/unmask the data ready pin, and in those cases each driver was just calling '{dis}enable_irq()' to control the trigger state. This change moves that handling into the library by introducing a new boolean in the data structure that tells the library that the device cannot unmask the pin. On top of controlling the trigger state, we can also use this flag to automatically request the IRQ with 'IRQF_NO_AUTOEN' in case it is set. So far, all users of the library want to start operation with the IRQs/DRDY pin disabled, so it should be fairly safe to do this inside the library. Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20210903141423.517028-3-nuno.sa@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index cf49997d5903..7c02f5292eea 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -49,6 +49,7 @@ struct adis_timeout { * @status_error_mask: Bitmask of errors supported by the device * @timeouts: Chip specific delays * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable + * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin * @has_paging: True if ADIS device has paged registers * @burst_reg_cmd: Register command that triggers burst * @burst_len: Burst size in the SPI RX buffer.
If @burst_max_len is defined, @@ -78,6 +79,7 @@ struct adis_data { unsigned int status_error_mask; int (*enable_irq)(struct adis *adis, bool enable); + bool unmasked_drdy; bool has_paging; -- cgit v1.2.3 From 95ec3fdf2b79eaff79e78688bbc2f7dbb98d68b6 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Jun 2021 16:10:36 +0100 Subject: iio: core: Introduce iio_push_to_buffers_with_ts_unaligned() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whilst it is almost always possible to arrange for scan data to be read directly into a buffer that is suitable for passing to iio_push_to_buffers_with_timestamp(), there are a few places where leading data needs to be skipped over. For these cases introduce a function that will allocate an appropriately sized and aligned bounce buffer (if not already allocated) and copy the unaligned data into that before calling iio_push_to_buffers_with_timestamp() on the bounce buffer. We tie the lifespan of this buffer to that of the iio_dev.dev, which should ensure no memory leaks occur. Signed-off-by: Jonathan Cameron Reviewed-by: Nuno Sá Link: https://lore.kernel.org/r/20210613151039.569883-2-jic23@kernel.org --- include/linux/iio/buffer.h | 4 ++++ include/linux/iio/iio-opaque.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index b6928ac5c63d..451379a3984a 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -38,6 +38,10 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, return iio_push_to_buffers(indio_dev, data); } +int iio_push_to_buffers_with_ts_unaligned(struct iio_dev *indio_dev, + const void *data, size_t data_sz, + int64_t timestamp); + bool iio_validate_scan_mask_onehot(struct iio_dev *indio_dev, const unsigned long *mask); diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index c9504e9da571..2be12b7b5dc5 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -23,6 +23,8 @@ * @groupcounter: index of next attribute group * @legacy_scan_el_group: attribute group for legacy scan elements attribute group * @legacy_buffer_group: attribute group for legacy buffer attributes group + * @bounce_buffer: for devices that call iio_push_to_buffers_with_timestamp_unaligned() + * @bounce_buffer_size: size of currently allocate bounce buffer + * @scan_index_timestamp: cache of the index to the timestamp * @clock_id: timestamping clock posix identifier * @chrdev: associated character device @@ -50,6 +52,8 @@ struct iio_dev_opaque { int groupcounter; struct attribute_group legacy_scan_el_group; struct attribute_group legacy_buffer_group; + void *bounce_buffer; + size_t bounce_buffer_size; unsigned int scan_index_timestamp; clockid_t clock_id; -- cgit v1.2.3 From 9eeee3b0bf190b4f677af27e24ba0cd1c030e49b Mon Sep 17 00:00:00 2001 From: Mihail Chindris Date: Thu, 7 Oct 2021 08:00:30 +0000 Subject: iio: Add output buffer support Currently IIO only supports buffer mode for capture devices like ADCs. Add support for buffered mode for output devices like DACs. The output buffer implementation is analogous to the input buffer implementation. Instead of using read() to get data from the buffer, write() is used to copy data into the buffer. poll() with POLLOUT will wake up if there is space available. Drivers can remove data from a buffer using iio_pop_from_buffer(); the function can e.g.
be called from a trigger handler to write the data to the hardware. A buffer can be either an output buffer or an input buffer, but not both. So, for a device that has an ADC and a DAC path, this will mean 2 IIO buffers (one for each direction). The direction of the buffer is decided by the new direction field of the iio_buffer struct and should be set after allocating and before registering it. Co-developed-by: Lars-Peter Clausen Signed-off-by: Lars-Peter Clausen Co-developed-by: Alexandru Ardelean Signed-off-by: Alexandru Ardelean Signed-off-by: Mihail Chindris Link: https://lore.kernel.org/r/20211007080035.2531-2-mihail.chindris@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 7 +++++++ include/linux/iio/buffer_impl.h | 11 +++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 451379a3984a..418b1307d3f2 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -11,8 +11,15 @@ struct iio_buffer; +enum iio_buffer_direction { + IIO_BUFFER_DIRECTION_IN, + IIO_BUFFER_DIRECTION_OUT, +}; + int iio_push_to_buffers(struct iio_dev *indio_dev, const void *data); +int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); + /** * iio_push_to_buffers_with_timestamp() - push data and timestamp to buffers * @indio_dev: iio_dev structure for device. diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index 245b32918ae1..e2ca8ea23e19 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -7,6 +7,7 @@ #ifdef CONFIG_IIO_BUFFER #include +#include struct iio_dev; struct iio_buffer; @@ -23,6 +24,10 @@ struct iio_buffer; * @read: try to get a specified number of bytes (must exist) * @data_available: indicates how much data is available for reading from * the buffer. + * @remove_from: remove scan from buffer. Drivers should calls this to + * remove a scan from a buffer. + * @write: try to write a number of bytes + * @space_available: returns the amount of bytes available in a buffer * @request_update: if a parameter change has been marked, update underlying * storage. * @set_bytes_per_datum:set number of bytes per datum @@ -49,6 +54,9 @@ struct iio_buffer_access_funcs { int (*store_to)(struct iio_buffer *buffer, const void *data); int (*read)(struct iio_buffer *buffer, size_t n, char __user *buf); size_t (*data_available)(struct iio_buffer *buffer); + int (*remove_from)(struct iio_buffer *buffer, void *data); + int (*write)(struct iio_buffer *buffer, size_t n, const char __user *buf); + size_t (*space_available)(struct iio_buffer *buffer); int (*request_update)(struct iio_buffer *buffer); @@ -80,6 +88,9 @@ struct iio_buffer { /** @bytes_per_datum: Size of individual datum including timestamp. */ size_t bytes_per_datum; + /* @direction: Direction of the data stream (in/out). */ + enum iio_buffer_direction direction; + /** * @access: Buffer access functions associated with the * implementation. -- cgit v1.2.3 From c02cd5c19c17698f12b731e898127095f9bc2921 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 7 Oct 2021 08:00:32 +0000 Subject: iio: triggered-buffer: extend support to configure output buffers Now that output (kfifo) buffers are supported, we need to extend the {devm_}iio_triggered_buffer_setup_ext() parameter list to take a direction parameter. This allows us to attach an output triggered buffer to a DAC device.
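As a sketch, attaching an output buffer to a DAC with the extended call would look roughly like this (the handler and ops names are hypothetical):

	ret = devm_iio_triggered_buffer_setup_ext(&spi->dev, indio_dev,
						  NULL, foo_dac_trigger_handler,
						  IIO_BUFFER_DIRECTION_OUT,
						  &foo_dac_setup_ops, NULL);
	if (ret)
		return ret;

where foo_dac_trigger_handler() would use iio_pop_from_buffer() to fetch the next scan and write it out to the hardware.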
Unfortunately it's a bit difficult to add another macro to avoid changing the 5 drivers where {devm_}iio_triggered_buffer_setup_ext() is used. Well, it's doable, but may not be worth the trouble vs just updating all these 5 drivers. Signed-off-by: Alexandru Ardelean Signed-off-by: Mihail Chindris Link: https://lore.kernel.org/r/20211007080035.2531-4-mihail.chindris@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/triggered_buffer.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/triggered_buffer.h b/include/linux/iio/triggered_buffer.h index 7f154d1f8739..7490b05fc5b2 100644 --- a/include/linux/iio/triggered_buffer.h +++ b/include/linux/iio/triggered_buffer.h @@ -2,6 +2,7 @@ #ifndef _LINUX_IIO_TRIGGERED_BUFFER_H_ #define _LINUX_IIO_TRIGGERED_BUFFER_H_ +#include #include struct attribute; @@ -11,21 +12,27 @@ struct iio_buffer_setup_ops; int iio_triggered_buffer_setup_ext(struct iio_dev *indio_dev, irqreturn_t (*h)(int irq, void *p), irqreturn_t (*thread)(int irq, void *p), + enum iio_buffer_direction direction, const struct iio_buffer_setup_ops *setup_ops, const struct attribute **buffer_attrs); void iio_triggered_buffer_cleanup(struct iio_dev *indio_dev); #define iio_triggered_buffer_setup(indio_dev, h, thread, setup_ops) \ - iio_triggered_buffer_setup_ext((indio_dev), (h), (thread), (setup_ops), NULL) + iio_triggered_buffer_setup_ext((indio_dev), (h), (thread), \ + IIO_BUFFER_DIRECTION_IN, (setup_ops), \ + NULL) int devm_iio_triggered_buffer_setup_ext(struct device *dev, struct iio_dev *indio_dev, irqreturn_t (*h)(int irq, void *p), irqreturn_t (*thread)(int irq, void *p), + enum iio_buffer_direction direction, const struct iio_buffer_setup_ops *ops, const struct attribute **buffer_attrs); #define devm_iio_triggered_buffer_setup(dev, indio_dev, h, thread, setup_ops) \ - devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread), (setup_ops), NULL) + devm_iio_triggered_buffer_setup_ext((dev), (indio_dev), (h), (thread), \ + IIO_BUFFER_DIRECTION_IN, \ + (setup_ops), NULL) #endif -- cgit v1.2.3 From aafa1cafedca7c64d6a43cd21488d28d1d1be884 Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Sat, 2 Oct 2021 12:32:30 +0300 Subject: platform_data/mlxreg: Add new type to support modular systems Add new types for the Nvidia modular systems MSN4800, which can be equipped with different types of replaceable line cards. Add a new type to specify the kind of hotplug events for the line cards. The line card events are generated by the programmable device located on the main board. This device implements interrupt controller logic. Line card interrupts are associated with different line card states during their initialization: insertion, security signature validation, power good state, hardware-firmware synchronization state, line card PHYs readiness state, firmware availability for line card ports. Also, under some circumstances the hardware can generate a thermal shutdown for a particular line card. Add a new type specifying the action to be performed when a particular hotplug event is received. This action defines how the hotplug event should be handled by the hotplug driver. There are the following action types: - Connect an I2C device with an empty 'platform_data' field according to the platform topology, if the device is configured (for example, a power unit micro-controller driver when the power unit is connected to a power source; this is what is currently supported).
- Connect a device with the 'platform_data' field set according to the platform topology. The purpose is to pass 'platform_data' through the hotplug driver to the underlying device (for example, a line card driver). - No device is associated with the hotplug event; just send a "udev" event (this is what is currently supported). Extend structure 'mlxreg_hotplug_device' with a hotplug action field. Extend structure 'mlxreg_core_data' with: - Registers for line card power and enabling control. - A slot number field, to indicate at which physical slot the replaceable line card device is located. Signed-off-by: Vadim Pasternak Reviewed-by: Michael Shych Link: https://lore.kernel.org/r/20211002093238.3771419-2-vadimp@nvidia.com Signed-off-by: Hans de Goede --- include/linux/platform_data/mlxreg.h | 57 ++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 101333fe2b8d..49f0e15a10dd 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -24,6 +24,51 @@ enum mlxreg_wdt_type { MLX_WDT_TYPE3, }; +/** + * enum mlxreg_hotplug_kind - kind of hotplug entry + * + * @MLXREG_HOTPLUG_DEVICE_NA: do not care; + * @MLXREG_HOTPLUG_LC_PRESENT: entry for line card presence in/out events; + * @MLXREG_HOTPLUG_LC_VERIFIED: entry for line card verification status events + * coming after line card security signature validation; + * @MLXREG_HOTPLUG_LC_POWERED: entry for line card power on/off events; + * @MLXREG_HOTPLUG_LC_SYNCED: entry for line card synchronization events, coming + * after hardware-firmware synchronization handshake; + * @MLXREG_HOTPLUG_LC_READY: entry for line card ready events, indicating line card + PHYs ready / unready state; + * @MLXREG_HOTPLUG_LC_ACTIVE: entry for line card active events, indicating firmware + * availability / unavailability for the ports on line card; + * @MLXREG_HOTPLUG_LC_THERMAL: entry for line card thermal shutdown events, positive + * event indicates that system should power off the line + * card for which this event has been received; + */ +enum mlxreg_hotplug_kind { + MLXREG_HOTPLUG_DEVICE_NA = 0, + MLXREG_HOTPLUG_LC_PRESENT = 1, + MLXREG_HOTPLUG_LC_VERIFIED = 2, + MLXREG_HOTPLUG_LC_POWERED = 3, + MLXREG_HOTPLUG_LC_SYNCED = 4, + MLXREG_HOTPLUG_LC_READY = 5, + MLXREG_HOTPLUG_LC_ACTIVE = 6, + MLXREG_HOTPLUG_LC_THERMAL = 7, +}; + +/** + * enum mlxreg_hotplug_device_action - hotplug device action required for + * driver's connectivity + * + * @MLXREG_HOTPLUG_DEVICE_DEFAULT_ACTION: probe device for 'on' event, remove + * for 'off' event; + * @MLXREG_HOTPLUG_DEVICE_PLATFORM_ACTION: probe platform device for 'on' + * event, remove for 'off' event; + * @MLXREG_HOTPLUG_DEVICE_NO_ACTION: no connectivity action is required; + */ +enum mlxreg_hotplug_device_action { + MLXREG_HOTPLUG_DEVICE_DEFAULT_ACTION = 0, + MLXREG_HOTPLUG_DEVICE_PLATFORM_ACTION = 1, + MLXREG_HOTPLUG_DEVICE_NO_ACTION = 2, +}; + /** * struct mlxreg_hotplug_device - I2C device data: * @@ -31,6 +76,7 @@ enum mlxreg_wdt_type { * @client: I2C device client; * @brdinfo: device board information; * @nr: I2C device adapter number, to which device is to be attached; + * @action: action to be performed upon event receiving; * * Structure represents I2C hotplug device static data (board topology) and * dynamic data (related kernel objects handles).
@@ -40,6 +86,7 @@ struct mlxreg_hotplug_device { struct i2c_client *client; struct i2c_board_info *brdinfo; int nr; + enum mlxreg_hotplug_device_action action; }; /** @@ -51,12 +98,16 @@ struct mlxreg_hotplug_device { * @bit: attribute effective bit; * @capability: attribute capability register; * @reg_prsnt: attribute presence register; + * @reg_sync: attribute synch register; + * @reg_pwr: attribute power register; + * @reg_ena: attribute enable register; * @mode: access mode; * @np - pointer to node platform associated with attribute; * @hpdev - hotplug device data; * @health_cntr: dynamic device health indication counter; * @attached: true if device has been attached after good health indication; * @regnum: number of registers occupied by multi-register attribute; + * @slot: slot number, at which device is located; */ struct mlxreg_core_data { char label[MLXREG_CORE_LABEL_MAX_SIZE]; @@ -65,18 +116,23 @@ struct mlxreg_core_data { u32 bit; u32 capability; u32 reg_prsnt; + u32 reg_sync; + u32 reg_pwr; + u32 reg_ena; umode_t mode; struct device_node *np; struct mlxreg_hotplug_device hpdev; u32 health_cntr; bool attached; u8 regnum; + u8 slot; }; /** * struct mlxreg_core_item - same type components controlled by the driver: * * @data: component data; + * @kind: kind of hotplug attribute; * @aggr_mask: group aggregation mask; * @reg: group interrupt status register; * @mask: group interrupt mask; @@ -89,6 +145,7 @@ struct mlxreg_core_data { */ struct mlxreg_core_item { struct mlxreg_core_data *data; + enum mlxreg_hotplug_kind kind; u32 aggr_mask; u32 reg; u32 mask; -- cgit v1.2.3 From bb1023b6da379985ff888dcd7bc601735d02267a Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Sat, 2 Oct 2021 12:32:32 +0300 Subject: platform/mellanox: mlxreg-hotplug: Extend logic for hotplug devices operations Extend the structure 'mlxreg_hotplug_device' with a platform device field to allow passing the register map and system interrupt line number to underlying hotplug devices sharing the same register map and same interrupt line with the 'mlxreg-hotplug' driver. Extend the logic for hotplug device creation and removal according to the action associated with the hotplug device description. Previously the hotplug driver was only capable of attaching / detaching, upon hotplug events, I2C devices handled by simple I2C drivers. Now it is also able to attach devices handled by platform drivers. The motivation is to allow passing platform data like: - the system interrupt line number, shared with 'mlxreg-hotplug', to underlying hotplug devices. - the shared register map of programmable devices on the main board to underlying hotplug devices. Additionally, the number of 'sysfs' attributes is increased, since the modular system defines more 'sysfs' attributes.
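As a sketch of how a line card consumer might hook in (the identity string and handler below are illustrative, not taken from an in-tree user):

	static int lc_event_handler(void *handle, enum mlxreg_hotplug_kind kind,
				    u8 action)
	{
		/* e.g. react to MLXREG_HOTPLUG_LC_POWERED on/off transitions */
		return 0;
	}

	static struct mlxreg_core_hotplug_notifier lc_notifier = {
		.identity = "lc1",
		.user_handler = lc_event_handler,
	};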
Signed-off-by: Vadim Pasternak Reviewed-by: Michael Shych Link: https://lore.kernel.org/r/20211002093238.3771419-4-vadimp@nvidia.com Signed-off-by: Hans de Goede --- include/linux/platform_data/mlxreg.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 49f0e15a10dd..3122d550dc00 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -69,6 +69,19 @@ enum mlxreg_hotplug_device_action { MLXREG_HOTPLUG_DEVICE_NO_ACTION = 2, }; +/** + * struct mlxreg_core_hotplug_notifier - hotplug notifier block: + * + * @identity: notifier identity name; + * @handle: user handle to be passed by user handler function; + * @user_handler: user handler function associated with the event; + */ +struct mlxreg_core_hotplug_notifier { + char identity[MLXREG_CORE_LABEL_MAX_SIZE]; + void *handle; + int (*user_handler)(void *handle, enum mlxreg_hotplug_kind kind, u8 action); +}; + /** * struct mlxreg_hotplug_device - I2C device data: * @@ -76,7 +89,11 @@ enum mlxreg_hotplug_device_action { * @client: I2C device client; * @brdinfo: device board information; * @nr: I2C device adapter number, to which device is to be attached; + * @pdev: platform device, if device is instantiated as a platform device; * @action: action to be performed upon event receiving; + * @handle: user handle to be passed by user handler function; + * @user_handler: user handler function associated with the event; + * @notifier: pointer to event notifier block; * * Structure represents I2C hotplug device static data (board topology) and * dynamic data (related kernel objects handles). @@ -86,7 +103,11 @@ struct mlxreg_hotplug_device { struct i2c_client *client; struct i2c_board_info *brdinfo; int nr; + struct platform_device *pdev; enum mlxreg_hotplug_device_action action; + void *handle; + int (*user_handler)(void *handle, enum mlxreg_hotplug_kind kind, u8 action); + struct mlxreg_core_hotplug_notifier *notifier; }; /** @@ -104,10 +125,12 @@ struct mlxreg_hotplug_device { * @mode: access mode; * @np - pointer to node platform associated with attribute; * @hpdev - hotplug device data; + * @notifier: pointer to event notifier block; * @health_cntr: dynamic device health indication counter; * @attached: true if device has been attached after good health indication; * @regnum: number of registers occupied by multi-register attribute; * @slot: slot number, at which device is located; + * @secured: if set indicates that entry access is secured; */ struct mlxreg_core_data { char label[MLXREG_CORE_LABEL_MAX_SIZE]; @@ -122,6 +145,7 @@ struct mlxreg_core_data { umode_t mode; struct device_node *np; struct mlxreg_hotplug_device hpdev; + struct mlxreg_core_hotplug_notifier *notifier; u32 health_cntr; bool attached; u8 regnum; -- cgit v1.2.3 From 9d93d7877c9158d059028681b66a0c192f85c64d Mon Sep 17 00:00:00 2001 From: Vadim Pasternak Date: Sat, 2 Oct 2021 12:32:35 +0300 Subject: platform_data/mlxreg: Add new field for secured access Extend structure 'mlxreg_core_data' with the field 'secured'. The purpose of this field is to restrict access to some attributes if the kernel is configured with security options like LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY. Access to some attributes which, for example, allow burning of hardware components (FPGA, CPLD, SPI, etc.) can break the system. If the user does not want to allow such access, it can be disabled by setting the security options.
Signed-off-by: Vadim Pasternak Reviewed-by: Michael Shych Link: https://lore.kernel.org/r/20211002093238.3771419-7-vadimp@nvidia.com Signed-off-by: Hans de Goede --- include/linux/platform_data/mlxreg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 3122d550dc00..40185f9d7c14 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -150,6 +150,7 @@ struct mlxreg_core_data { bool attached; u8 regnum; u8 slot; + u8 secured; }; /** -- cgit v1.2.3 From 54f5b3615f199673ecb23f0a6847fd9c43aaa012 Mon Sep 17 00:00:00 2001 From: Wang Kefeng Date: Mon, 23 Aug 2021 10:41:41 +0100 Subject: ARM: 9121/1: amba: Drop unused functions about APB/AHB devices add No one uses the following functions; kill them. amba_aphb_device_add() amba_apb_device_add() amba_apb_device_add_res() amba_ahb_device_add() amba_ahb_device_add_res() Cc: Linus Walleij Reviewed-by: Rob Herring Signed-off-by: Kefeng Wang Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index c68d87b87283..edfcf7a14dcd 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -122,24 +122,6 @@ struct amba_device *amba_device_alloc(const char *, resource_size_t, size_t); void amba_device_put(struct amba_device *); int amba_device_add(struct amba_device *, struct resource *); int amba_device_register(struct amba_device *, struct resource *); -struct amba_device *amba_apb_device_add(struct device *parent, const char *name, - resource_size_t base, size_t size, - int irq1, int irq2, void *pdata, - unsigned int periphid); -struct amba_device *amba_ahb_device_add(struct device *parent, const char *name, - resource_size_t base, size_t size, - int irq1, int irq2, void *pdata, - unsigned int periphid); -struct amba_device * -amba_apb_device_add_res(struct device *parent, const char *name, - resource_size_t base, size_t size, int irq1, - int irq2, void *pdata, unsigned int periphid, - struct resource *resbase); -struct amba_device * -amba_ahb_device_add_res(struct device *parent, const char *name, - resource_size_t base, size_t size, int irq1, - int irq2, void *pdata, unsigned int periphid, - struct resource *resbase); void amba_device_unregister(struct amba_device *); struct amba_device *amba_find_device(const char *, struct device *, unsigned int, unsigned int); int amba_request_regions(struct amba_device *, const char *); -- cgit v1.2.3 From f5245a5fdf757f50a6c905fc16cceb1a6146ccf5 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Sun, 17 Oct 2021 13:55:21 -0500 Subject: counter: drop chrdev_lock This removes the chrdev_lock from the counter subsystem. This was intended to prevent opening the chrdev more than once. However, this doesn't work in practice since userspace can duplicate file descriptors and pass them to other processes. Since this protection can't be relied on, it is best to just remove it.
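A userspace sketch of why such a lock is ineffective (illustrative only):

	int fd = open("/dev/counter0", O_RDWR);	/* the only open() that was gated */
	int fd2 = dup(fd);			/* second handle, no second open() */
	/* fd can likewise be passed to another process via SCM_RIGHTS */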
Suggested-by: Greg KH Acked-by: William Breathitt Gray Signed-off-by: David Lechner Link: https://lore.kernel.org/r/20211017185521.3468640-1-david@lechnology.com Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 22b14a552b1d..0fd99e255a50 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -297,7 +297,6 @@ struct counter_ops { * @events: queue of detected Counter events * @events_wait: wait queue to allow blocking reads of Counter events * @events_lock: lock to protect Counter events queue read operations - * @chrdev_lock: lock to limit chrdev to a single open at a time * @ops_exist_lock: lock to prevent use during removal */ struct counter_device { @@ -325,12 +324,6 @@ struct counter_device { DECLARE_KFIFO_PTR(events, struct counter_event); wait_queue_head_t events_wait; struct mutex events_lock; - /* - * chrdev_lock is locked by counter_chrdev_open() and unlocked by - * counter_chrdev_release(), so a mutex is not possible here because - * chrdev_lock will invariably be held when returning to user space - */ - atomic_t chrdev_lock; struct mutex ops_exist_lock; }; -- cgit v1.2.3 From 5c67aa59bd8f99d70c81b5e71bd02083056a8486 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 14 Oct 2021 16:26:11 +0300 Subject: mmc: sdhci-pci: Remove dead code (struct sdhci_pci_data et al) The last user of this struct went away a couple of releases ago. Besides that, there were not many users of this API over more than 10 years: 1/ One is Intel Merrifield, which was added on 2016-08-31 by commit 3976b0380b31 ("x86/platform/intel-mid: Enable SD card detection on Merrifield") and removed on 2021-02-11 by commit 4590d98f5a4f ("sfi: Remove framework for deprecated firmware"). 2/ The other is Intel Sunrisepoint related, which was added on 2015-02-06 by commit e1bfad6d936d ("mmc: sdhci-pci: Add support for drive strength selection for SPT") and removed on 2017-03-20 by commit 51ced59cc02e ("mmc: sdhci-pci: Use ACPI DSM to get driver strength for some Intel devices"). Effectively this is a revert of commit 52c506f0bc72 ("mmc: sdhci-pci: add platform data").
Signed-off-by: Andy Shevchenko Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20211014132613.27861-4-andriy.shevchenko@linux.intel.com Signed-off-by: Ulf Hansson --- include/linux/mmc/sdhci-pci-data.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/mmc/sdhci-pci-data.h (limited to 'include/linux') diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h deleted file mode 100644 index 1d42872d22f3..000000000000 --- a/include/linux/mmc/sdhci-pci-data.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_MMC_SDHCI_PCI_DATA_H -#define LINUX_MMC_SDHCI_PCI_DATA_H - -struct pci_dev; - -struct sdhci_pci_data { - struct pci_dev *pdev; - int slotno; - int rst_n_gpio; /* Set to -EINVAL if unused */ - int cd_gpio; /* Set to -EINVAL if unused */ - int (*setup)(struct sdhci_pci_data *data); - void (*cleanup)(struct sdhci_pci_data *data); -}; - -extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev, - int slotno); -#endif -- cgit v1.2.3 From cf6a8b1b24d675afc35a01cccd081160014a0125 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 12 Oct 2021 13:26:30 +0300 Subject: RDMA/mlx5: Remove iova from struct mlx5_core_mkey iova is already stored in ibmr->iova, no need to store it here. Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Acked-by: Michael S. Tsirkin Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e23417424373..669904f9986e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -364,7 +364,6 @@ enum { }; struct mlx5_core_mkey { - u64 iova; u64 size; u32 key; u32 pd; -- cgit v1.2.3 From 062fd731e51ee29ba745b2fd1c7ac87dd460d4ca Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 12 Oct 2021 13:26:31 +0300 Subject: RDMA/mlx5: Remove size from struct mlx5_core_mkey mkey->size is already stored in ibmr->length, no need to store it here. Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Acked-by: Michael S. Tsirkin Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 669904f9986e..ff1e991314e2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -364,7 +364,6 @@ enum { }; struct mlx5_core_mkey { - u64 size; u32 key; u32 pd; u32 type; -- cgit v1.2.3 From c64674168b6a2f293e92caf33c917ccf10886801 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 12 Oct 2021 13:26:32 +0300 Subject: RDMA/mlx5: Remove pd from struct mlx5_core_mkey There is no read of mkey->pd, only write. Remove it. Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Acked-by: Michael S. 
Tsirkin Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ff1e991314e2..f0ce7d4dc4ff 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -365,7 +365,6 @@ enum { struct mlx5_core_mkey { u32 key; - u32 pd; u32 type; struct wait_queue_head wait; refcount_t usecount; -- cgit v1.2.3 From 83fec3f12a5904b62330fd1a89af6d892afc387e Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 12 Oct 2021 13:26:33 +0300 Subject: RDMA/mlx5: Replace struct mlx5_core_mkey by u32 key In mlx5_core and vdpa there is no use of mlx5_core_mkey members except for the key itself. As preparation for moving mlx5_core_mkey to mlx5_ib, the occurrences of struct mlx5_core_mkey in all modules except for mlx5_ib are replaced by a u32 key. Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Acked-by: Michael S. Tsirkin Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f0ce7d4dc4ff..bd99f713720b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -650,7 +650,7 @@ struct mlx5e_resources { struct mlx5e_hw_objs { u32 pdn; struct mlx5_td td; - struct mlx5_core_mkey mkey; + u32 mkey; struct mlx5_sq_bfreg bfreg; } hw_objs; struct devlink_port dl_port; @@ -1021,13 +1021,11 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, gfp_t flags, int npages); void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey, - u32 *in, int inlen); -int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey); -int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - u32 *out, int outlen); +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in, + int inlen); +int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey); +int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 mkey, u32 *out, + int outlen); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn); int mlx5_pagealloc_init(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 4123bfb0b28b77b944360be8c758b1a0974e96ad Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 12 Oct 2021 13:26:34 +0300 Subject: RDMA/mlx5: Move struct mlx5_core_mkey to mlx5_ib Move mlx5_core_mkey struct to mlx5_ib, as the mlx5_core doesn't use it at this point. Signed-off-by: Aharon Landau Reviewed-by: Shay Drory Acked-by: Michael S. 
Tsirkin Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index bd99f713720b..70b6aff4940a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -357,19 +357,6 @@ struct mlx5_core_sig_ctx { u32 sigerr_count; }; -enum { - MLX5_MKEY_MR = 1, - MLX5_MKEY_MW, - MLX5_MKEY_INDIRECT_DEVX, -}; - -struct mlx5_core_mkey { - u32 key; - u32 type; - struct wait_queue_head wait; - refcount_t usecount; -}; - #define MLX5_24BIT_MASK ((1 << 24) - 1) enum mlx5_res_type { -- cgit v1.2.3 From e80094a473eefad9d856ce3ab0d7afdbb64800c4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 18 Oct 2021 14:10:02 -0700 Subject: ethernet: add a helper for assigning port addresses We have 5 drivers which offset the base MAC addr by port id. Create a helper for them. This helper takes care of overflows, which some drivers did not do; please complain if that's going to break anything! Signed-off-by: Jakub Kicinski Reviewed-by: Vladimir Oltean Reviewed-by: Shannon Nelson Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 23681c3d3b8a..2ad71cc90b37 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -551,6 +551,27 @@ static inline unsigned long compare_ether_header(const void *a, const void *b) #endif } +/** + * eth_hw_addr_gen - Generate and assign Ethernet address to a port + * @dev: pointer to port's net_device structure + * @base_addr: base Ethernet address + * @id: offset to add to the base address + * + * Generate a MAC address using a base address and an offset and assign it + * to a net_device. Commonly used by switch drivers which need to compute + * addresses for all their ports. addr_assign_type is not changed. + */ +static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr, + unsigned int id) +{ + u64 u = ether_addr_to_u64(base_addr); + u8 addr[ETH_ALEN]; + + u += id; + u64_to_ether_addr(u, addr); + eth_hw_addr_set(dev, addr); +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad -- cgit v1.2.3 From db9a02baa23267c695a44234a0f2f4607992780e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Oct 2021 06:15:04 -0600 Subject: block: move bdev_read_only() into the header This is called for every write in the fast path; move it inline next to get_disk_ro(), which is called internally.
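As a sketch, the submission-path check that now stays inline looks roughly like this (illustrative, not a verbatim quote of block/blk-core.c):

	static inline bool bio_check_ro(struct bio *bio)
	{
		/* refuse writes to read-only devices without a function call */
		return op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev);
	}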
Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/genhd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd4038fd5743..c70bc5fce4db 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -221,6 +221,11 @@ static inline int get_disk_ro(struct gendisk *disk) test_bit(GD_READ_ONLY, &disk->state); } +static inline int bdev_read_only(struct block_device *bdev) +{ + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); +} + extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -- cgit v1.2.3 From e028f167eca5fc56938e6c1680d40eed0bc39e80 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Oct 2021 16:38:14 -0600 Subject: block: move blk_mq_tag_to_rq() inline This is in the fast path of driver issue or completion, and it's a single array index operation. Move it inline to avoid a function call for it. This does mean making struct blk_mq_tags block layer public, but there's not really much in there. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 656fe34bdb6c..6cf35de151a9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -7,6 +7,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -675,7 +676,40 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); + +/* + * Tag address space map. + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; + + struct request **rqs; + struct request **static_rqs; + struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; +}; + +static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, + unsigned int tag) +{ + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); + return tags->rqs[tag]; + } + + return NULL; +} enum { BLK_MQ_UNIQUE_TAG_BITS = 16, -- cgit v1.2.3 From bc490f81731e181b07b8d7577425c06ae91692c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Oct 2021 10:12:12 -0600 Subject: block: change plugging to use a singly linked list Use a singly linked list for the blk_plug. This saves 8 bytes in the blk_plug struct, and makes for faster list manipulations than doubly linked lists. As we don't use the doubly linked lists for anything, singly linked is just fine. This yields a bump in default (merging enabled) performance from 7.0 to 7.1M IOPS, and ~7.5M IOPS with merging disabled. 
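The list manipulation itself turns into a simple head push, roughly (a sketch; the exact helpers in blk-mq may differ):

	/* LIFO push of a request onto the singly linked plug list */
	rq->rq_next = plug->mq_list;
	plug->mq_list = rq;
	plug->rq_count++;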
Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fd9771a1da09..4027112b9851 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -728,7 +728,7 @@ extern void blk_set_queue_dying(struct request_queue *); * schedule() where blk_schedule_flush_plug() is called. */ struct blk_plug { - struct list_head mq_list; /* blk-mq requests */ + struct request *mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; @@ -777,8 +777,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) struct blk_plug *plug = tsk->plug; return plug && - (!list_empty(&plug->mq_list) || - !list_empty(&plug->cb_list)); + (plug->mq_list || !list_empty(&plug->cb_list)); } int blkdev_issue_flush(struct block_device *bdev); -- cgit v1.2.3 From dc5fc361d891e089dfd9c0a975dc78041036b906 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Oct 2021 06:02:30 -0600 Subject: block: attempt direct issue of plug list If we have just one queue type in the plug list, then we can extend our direct issue to cover a full plug list as well. This allows sending a batch of requests for direct issue, which is more efficient than doing one-at-a-time kind of issue. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4027112b9851..f13091d3d476 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -737,6 +737,7 @@ struct blk_plug { unsigned short rq_count; bool multiple_queues; + bool has_elevator; bool nowait; struct list_head cb_list; /* md requires an unplug callback */ -- cgit v1.2.3 From cd45c9bf8b43cd387e167cf166ae5c517f56d658 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 18 Oct 2021 16:33:22 +0200 Subject: ASoC: Intel: Move soc_intel_is_foo() helpers to a generic header The soc_intel_is_foo() helpers from sound/soc/intel/common/soc-intel-quirks.h are useful outside of the sound subsystem too. Move these to include/linux/platform_data/x86/soc.h, so that other code can use them too. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Acked-by: Mark Brown Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20211018143324.296961-2-hdegoede@redhat.com --- include/linux/platform_data/x86/soc.h | 65 +++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 include/linux/platform_data/x86/soc.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/soc.h b/include/linux/platform_data/x86/soc.h new file mode 100644 index 000000000000..da05f425587a --- /dev/null +++ b/include/linux/platform_data/x86/soc.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers for Intel SoC model detection + * + * Copyright (c) 2019, Intel Corporation. 
+ */ + +#ifndef __PLATFORM_DATA_X86_SOC_H +#define __PLATFORM_DATA_X86_SOC_H + +#if IS_ENABLED(CONFIG_X86) + +#include +#include + +#define SOC_INTEL_IS_CPU(soc, type) \ +static inline bool soc_intel_is_##soc(void) \ +{ \ + static const struct x86_cpu_id soc##_cpu_ids[] = { \ + X86_MATCH_INTEL_FAM6_MODEL(type, NULL), \ + {} \ + }; \ + const struct x86_cpu_id *id; \ + \ + id = x86_match_cpu(soc##_cpu_ids); \ + if (id) \ + return true; \ + return false; \ +} + +SOC_INTEL_IS_CPU(byt, ATOM_SILVERMONT); +SOC_INTEL_IS_CPU(cht, ATOM_AIRMONT); +SOC_INTEL_IS_CPU(apl, ATOM_GOLDMONT); +SOC_INTEL_IS_CPU(glk, ATOM_GOLDMONT_PLUS); +SOC_INTEL_IS_CPU(cml, KABYLAKE_L); + +#else /* IS_ENABLED(CONFIG_X86) */ + +static inline bool soc_intel_is_byt(void) +{ + return false; +} + +static inline bool soc_intel_is_cht(void) +{ + return false; +} + +static inline bool soc_intel_is_apl(void) +{ + return false; +} + +static inline bool soc_intel_is_glk(void) +{ + return false; +} + +static inline bool soc_intel_is_cml(void) +{ + return false; +} +#endif /* IS_ENABLED(CONFIG_X86) */ + +#endif /* __PLATFORM_DATA_X86_SOC_H */ -- cgit v1.2.3 From aaa2975f2b07b04ee16b2cad1072cbdea3e1c50a Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 11 Oct 2021 05:31:42 +0800 Subject: lib/xz: Add MicroLZMA decoder MicroLZMA is yet another header format variant where the first byte of a raw LZMA stream (without the end of stream marker) has been replaced with a bitwise-negation of the lc/lp/pb properties byte. MicroLZMA was created to be used in EROFS but can be used by other things too where keeping the header overhead minimal is important. This is implemented using most of the LZMA2 code as is, so the amount of new code is small. The API has a few extra features compared to the XZ decoder. On the other hand, the API lacks XZ_BUF_ERROR support, which is important to take into account when using this API. MicroLZMA doesn't support BCJ filters. In theory they could be added later, as there are many unused/reserved values for the first byte of the compressed stream, but in practice it is somewhat unlikely to happen due to a few implementation reasons. Link: https://lore.kernel.org/r/20211010213145.17462-5-xiang@kernel.org Signed-off-by: Lasse Collin Signed-off-by: Gao Xiang --- include/linux/xz.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xz.h b/include/linux/xz.h index 9884c8440188..7285ca5d56e9 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); */ XZ_EXTERN void xz_dec_end(struct xz_dec *s); +/* + * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. + * See xz_dec_microlzma_alloc() below for details. + * + * These functions aren't used or available in preboot code and thus aren't + * marked with XZ_EXTERN. This avoids warnings about static functions that + * are never defined. + */ +/** + * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state + */ +struct xz_dec_microlzma; + +/** + * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder + * @mode XZ_SINGLE or XZ_PREALLOC + * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * at most 3 GiB. + * + * In contrast to xz_dec_init(), this function only allocates the memory + * and remembers the dictionary size. xz_dec_microlzma_reset() must be used + * before calling xz_dec_microlzma_run().
+ * + * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE. + * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated. + * + * On success, xz_dec_microlzma_alloc() returns a pointer to + * struct xz_dec_microlzma. If memory allocation fails or + * dict_size is invalid, NULL is returned. + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. + * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. + */ +extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); + +/** + * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state + * @s Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size Compressed size of the input stream + * @uncomp_size Uncompressed size of the input stream. A value smaller + * than the real uncompressed size of the input stream can + * be specified if uncomp_size_is_exact is set to false. + * uncomp_size can never be set to a value larger than the + * expected real uncompressed size because it would eventually + * result in XZ_DATA_ERROR. + * @uncomp_size_is_exact This is an int instead of bool to avoid + * requiring stdbool.h. This should normally be set to true. + * When this is set to false, error detection is weaker. + */ +extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, + uint32_t comp_size, uint32_t uncomp_size, + int uncomp_size_is_exact); + +/** + * xz_dec_microlzma_run() - Run the MicroLZMA decoder + * @s Decoder state initialized using xz_dec_microlzma_reset() + * @b: Input and output buffers + * + * This works similarly to xz_dec_run() with a few important differences. + * Only the differences are documented here. + * + * The only possible return values are XZ_OK, XZ_STREAM_END, and + * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress + * is possible due to lack of input data or output space, this function will + * keep returning XZ_OK. Thus, the calling code must be written so that it + * will eventually provide input and output space matching (or exceeding) + * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset(). + * If the caller cannot do this (for example, if the input file is truncated + * or otherwise corrupt), the caller must detect this error by itself to + * avoid an infinite loop. + * + * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned. + * This can happen also when incorrect dictionary, uncompressed, or + * compressed sizes have been specified. + * + * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over + * uncompressed data. This way the caller doesn't need to provide a temporary + * output buffer for the bytes that will be ignored. + * + * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK + * is also possible and thus XZ_SINGLE is actually a limited multi-call mode. + * After XZ_OK the bytes decoded so far may be read from the output buffer. + * It is possible to continue decoding but the variables b->out and b->out_pos + * MUST NOT be changed by the caller. 
Increasing the value of b->out_size is + * allowed to make more output space available; one doesn't need to provide + * space for the whole uncompressed data on the first call. The input buffer + * may be changed normally like with XZ_PREALLOC. This way input data can be + * provided from non-contiguous memory. + */ +extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, + struct xz_buf *b); + +/** + * xz_dec_microlzma_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_microlzma_alloc(). + * If s is NULL, this function does nothing. + */ +extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); + /* * Standalone build (userspace build or in-kernel build for boot time use) * needs a CRC32 implementation. For normal in-kernel use, kernel's own -- cgit v1.2.3 From e70feb8b3e6886c525c88943b5f1508d02f5a683 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 14 Oct 2021 16:17:10 +0800 Subject: blk-mq: support concurrent queue quiesce/unquiesce blk_mq_quiesce_queue() has been used quite widely now, but so far we don't support concurrent/nested quiesce. The biggest issue is that unquiesce can happen unexpectedly when quiesce/unquiesce are run concurrently from more than one context. This patch introduces q->mq_quiesce_depth to deal with concurrent quiesce, and we only unquiesce the queue when it is the last/outer-most one of all contexts. Several kernel panic issues have been reported[1][2][3] when running the stress quiesce test, and this patch has been verified in these reports. [1] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m1fc52431fad7f33b1ffc3f12c4450e4238540787 [2] https://lore.kernel.org/linux-block/9b21c797-e505-3821-4f5b-df7bf9380328@huawei.com/T/#m10ad90afeb9c8cc318334190a7c24c8b5c5e0722 [3] https://listman.redhat.com/archives/dm-devel/2021-September/msg00189.html Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211014081710.1871747-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f13091d3d476..2b22fa36e568 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -315,6 +315,8 @@ struct request_queue { */ struct mutex mq_freeze_lock; + int quiesce_depth; + struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set bio_split; -- cgit v1.2.3 From 34cdd18b8d245f3e901e5325313c27de727ab80d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Jun 2020 16:56:16 -0400 Subject: tracing: Use linker magic instead of recasting ftrace_ops_list_func() In an effort to enable -Wcast-function-type in the top-level Makefile to support Control Flow Integrity builds, all function casts need to be removed. This means that ftrace_ops_list_func() can no longer be defined as ftrace_ops_no_ops(). The reason for ftrace_ops_no_ops() is that some architectures call ftrace_ops_list_func() with only two parameters (called from assembly), and to make sure there are no C side effects, those archs call ftrace_ops_no_ops(), which only has two parameters, while ftrace_ops_list_func() has four. Instead of a typecast, use vmlinux.lds.h to define ftrace_ops_list_func() as arch_ftrace_ops_list_func(), which is declared with the proper set of parameters.
Link: https://lore.kernel.org/r/20200614070154.6039-1-oscar.carter@gmx.com Link: https://lkml.kernel.org/r/20200617165616.52241bde@oasis.local.home Link: https://lore.kernel.org/all/20211005053922.GA702049@embeddedor/ Requested-by: Oscar Carter Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 832e65f06754..12fcfa2d23ea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -30,16 +30,26 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +#ifdef CONFIG_FUNCTION_TRACER +struct ftrace_ops; +struct ftrace_regs; /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that * does. Or at least does enough to prevent any unwelcome side effects. + * + * Also define the function prototype that these architectures use + * to call the ftrace_ops_list_func(). */ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); #else # define FTRACE_FORCE_LIST_FUNC 0 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); #endif +#endif /* CONFIG_FUNCTION_TRACER */ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING @@ -88,8 +98,6 @@ extern int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -struct ftrace_ops; - #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { -- cgit v1.2.3 From 9b84fadc444de5456ab5f5487e2108311c724c3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 13:42:40 -0400 Subject: tracing: Reuse logic from perf's get_recursion_context() Instead of having branches that adds noise to the branch prediction, use the addition logic to set the bit for the level of interrupt context that the state is currently in. This copies the logic from perf's get_recursion_context() function. Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..f6da7a03bff0 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -137,12 +137,13 @@ enum { static __always_inline int trace_get_context_bit(void) { unsigned long pc = preempt_count(); + unsigned char bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - return TRACE_CTX_NORMAL; - else - return pc & NMI_MASK ? TRACE_CTX_NMI : - pc & HARDIRQ_MASK ? 
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return TRACE_CTX_NORMAL - bit; } #ifdef CONFIG_FTRACE_RECORD_RECURSION -- cgit v1.2.3 From 91ebe8bcbff9d2ff21303e73bf7434f39a98b255 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 15:01:19 -0400 Subject: tracing/perf: Add interrupt_context_level() helper Now that there are three different instances of doing the addition trick to the preempt_count() and NMI_MASK, HARDIRQ_MASK and SOFTIRQ_OFFSET macros, it deserves a helper function defined in the preempt.h header. Add the interrupt_context_level() helper and replace the three instances that do that logic with it. Link: https://lore.kernel.org/all/20211015142541.4badd8a9@gandalf.local.home/ Signed-off-by: Steven Rostedt (VMware) --- include/linux/preempt.h | 21 +++++++++++++++++++++ include/linux/trace_recursion.h | 7 +------ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..b32e3dabe28b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -77,6 +77,27 @@ /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include +/** + * interrupt_context_level - return interrupt context level + * + * Returns the current interrupt context level. + * 0 - normal context + * 1 - softirq context + * 2 - hardirq context + * 3 - NMI context + */ +static __always_inline unsigned char interrupt_context_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + level += !!(pc & (NMI_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return level; +} + #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index f6da7a03bff0..1d8cce02c3fb 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -136,12 +136,7 @@ enum { static __always_inline int trace_get_context_bit(void) { - unsigned long pc = preempt_count(); - unsigned char bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char bit = interrupt_context_level(); return TRACE_CTX_NORMAL - bit; } -- cgit v1.2.3 From 15bf32398ad488c0df1cbaf16431422c87e4feea Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 12 Oct 2021 09:23:07 -0400 Subject: security: Return xattr name from security_dentry_init_security() Right now security_dentry_init_security() only supports single security label and is used by SELinux only. There are two users of this hook, namely ceph and nfs. NFS does not care about xattr name. Ceph hardcodes the xattr name to security.selinux (XATTR_NAME_SELINUX). I am making changes to fuse/virtiofs to send security label to virtiofsd and I need to send xattr name as well. I also hardcoded the name of xattr to security.selinux. Stephen Smalley suggested that it probably is a good idea to modify security_dentry_init_security() to also return name of xattr so that we can avoid this hardcoding in the callers. 
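A worked illustration of the interrupt_context_level() arithmetic introduced above (a comment sketch, not part of either tracing patch). Suppose the CPU is handling a hard interrupt, so preempt_count() has bits set under HARDIRQ_MASK but none under NMI_MASK:

	/*
	 *   level += !!(pc & NMI_MASK);                                   -> +0
	 *   level += !!(pc & (NMI_MASK | HARDIRQ_MASK));                  -> +1
	 *   level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); -> +1
	 *
	 * level == 2 (hardirq). Each successively wider mask also covers the
	 * more urgent contexts, so NMI scores on all three tests (level 3),
	 * plain softirq only on the last (level 1), and normal task context
	 * on none (level 0) - no branches required.
	 */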
This patch adds a new parameter "const char **xattr_name" to security_dentry_init_security() and LSM puts the name of xattr too if caller asked for it (xattr_name != NULL). Signed-off-by: Vivek Goyal Reviewed-by: Jeff Layton Reviewed-by: Christian Brauner Acked-by: James Morris [PM: fixed typos in the commit description] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 ++- include/linux/lsm_hooks.h | 3 +++ include/linux/security.h | 6 ++++-- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 4c7ed0268ce3..a9ac70ae01ab 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -83,7 +83,8 @@ LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, - int mode, const struct qstr *name, void **ctx, u32 *ctxlen) + int mode, const struct qstr *name, const char **xattr_name, + void **ctx, u32 *ctxlen) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 528554e9b90c..0bada4df23fc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -196,6 +196,9 @@ * @dentry dentry to use in calculating the context. * @mode mode used to determine resource type. * @name name of the last path component used to create file + * @xattr_name pointer to place the pointer to security xattr name. + * Caller does not have to free the resulting pointer. Its + * a pointer to static string. * @ctx pointer to place the pointer to the resulting context in. * @ctxlen point to place the length of the resulting context. * @dentry_create_files_as: diff --git a/include/linux/security.h b/include/linux/security.h index cc6d39358336..7e0ba63b5dde 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -317,8 +317,9 @@ int security_add_mnt_opt(const char *option, const char *val, int len, void **mnt_opts); int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, - const struct qstr *name, void **ctx, - u32 *ctxlen); + const struct qstr *name, + const char **xattr_name, void **ctx, + u32 *ctxlen); int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, @@ -739,6 +740,7 @@ static inline void security_inode_free(struct inode *inode) static inline int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, + const char **xattr_name, void **ctx, u32 *ctxlen) { -- cgit v1.2.3 From cf6d6238cdd319eca404756dee05bf55a748b6a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Oct 2021 22:24:10 +0100 Subject: block: turn macro helpers into inline functions Replace bio_set_dev() with an identical inline helper and move it further to fix a dependency problem with bio_associate_blkg(). Do the same for bio_copy_dev(). 
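The reason the helpers had to move is worth spelling out (a sketch of the mechanics inferred from the commit message, not quoted from it): a macro body is only expanded at its call sites, so the old bio_set_dev() could sit above the CONFIG_BLK_CGROUP block and still compile, whereas a static inline is parsed where it is defined and needs every symbol it uses declared first:

	/* illustrative ordering constraint for the inline version */
	void bio_associate_blkg(struct bio *bio);	/* must be visible... */

	static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
	{
		/* flag handling elided; see the diff below */
		bio_associate_blkg(bio);		/* ...before this call */
	}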
Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/bio.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9538f20ffaa5..b12453d7b8a8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -430,22 +430,6 @@ void zero_fill_bio(struct bio *bio); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - bio_clear_flag(bio, BIO_REMAPPED); \ - if ((bio)->bi_bdev != (bdev)) \ - bio_clear_flag(bio, BIO_THROTTLED); \ - (bio)->bi_bdev = (bdev); \ - bio_associate_blkg(bio); \ -} while (0) - -#define bio_copy_dev(dst, src) \ -do { \ - bio_clear_flag(dst, BIO_REMAPPED); \ - (dst)->bi_bdev = (src)->bi_bdev; \ - bio_clone_blkg_association(dst, src); \ -} while (0) - #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) @@ -463,6 +447,22 @@ static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ +static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) +{ + bio_clear_flag(bio, BIO_REMAPPED); + if (bio->bi_bdev != bdev) + bio_clear_flag(bio, BIO_THROTTLED); + bio->bi_bdev = bdev; + bio_associate_blkg(bio); +} + +static inline void bio_copy_dev(struct bio *dst, struct bio *src) +{ + bio_clear_flag(dst, BIO_REMAPPED); + dst->bi_bdev = src->bi_bdev; + bio_clone_blkg_association(dst, src); +} + /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From c809084ab033a8d4ee404e2ac3c5d3dc80cb65f7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Oct 2021 22:24:14 +0100 Subject: block: inline a part of bio_release_pages() Inline BIO_NO_PAGE_REF check of bio_release_pages() to avoid function call. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b12453d7b8a8..c88700d1bdc3 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,7 +417,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_release_pages(struct bio *bio, bool mark_dirty); +void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); @@ -428,6 +428,12 @@ extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); void zero_fill_bio(struct bio *bio); +static inline void bio_release_pages(struct bio *bio, bool mark_dirty) +{ + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + __bio_release_pages(bio, mark_dirty); +} + extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_dev(bio) \ -- cgit v1.2.3 From dbb6f764a079d1dea883c6f2439d91db4f0fb2f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Oct 2021 16:41:17 +0200 Subject: blk-mq: move blk_mq_flush_plug_list to block/blk-mq.h This helper is internal to the block layer. 
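Returning to the bio_release_pages() change above: it follows the common inline-fast-path pattern, where the cheap flag test is compiled into every call site and the out-of-line __bio_release_pages() is entered only when pages were actually pinned. A hypothetical caller (names invented):

	static void demo_dio_end_io(struct bio *bio)
	{
		/* no function call at all when BIO_NO_PAGE_REF is set */
		bio_release_pages(bio, bio_data_dir(bio) == READ);
		bio_put(bio);
	}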
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211020144119.142582-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6cf35de151a9..e13780236550 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -656,8 +656,6 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); - void blk_mq_free_request(struct request *rq); bool blk_mq_queue_inflight(struct request_queue *q); -- cgit v1.2.3 From 008f75a20e7072d0840ec323c39b42206f3fa8a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Oct 2021 16:41:19 +0200 Subject: block: cleanup the flush plug helpers Consolidate the various helpers into a single blk_flush_plug helper that takes a plk_plug and the from_scheduler bool and switch all callsites to call it directly. Checks that the plug is non-NULL must be performed by the caller, something that most already do anyway. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211020144119.142582-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2b22fa36e568..c7b1e9355123 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -725,9 +725,8 @@ extern void blk_set_queue_dying(struct request_queue *); * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list - * or when attempting a merge, because blk_schedule_flush_list() will only flush - * the plug list when the task sleeps by itself. For details, please see - * schedule() where blk_schedule_flush_plug() is called. + * or when attempting a merge. For details, please see schedule() where + * blk_flush_plug() is called. 
*/ struct blk_plug { struct request *mq_list; /* blk-mq requests */ @@ -757,23 +756,8 @@ extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -extern void blk_flush_plug_list(struct blk_plug *, bool); -static inline void blk_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - if (plug) - blk_flush_plug_list(plug, false); -} - -static inline void blk_schedule_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - if (plug) - blk_flush_plug_list(plug, true); -} +void blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline bool blk_needs_flush_plug(struct task_struct *tsk) { @@ -802,15 +786,10 @@ static inline void blk_finish_plug(struct blk_plug *plug) { } -static inline void blk_flush_plug(struct task_struct *task) -{ -} - -static inline void blk_schedule_flush_plug(struct task_struct *task) +static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } - static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; -- cgit v1.2.3 From 55df0933be74bd2e52aba0b67eb743ae0feabe7e Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 20 Oct 2021 14:09:00 +1100 Subject: workqueue: Introduce show_one_worker_pool and show_one_workqueue. Currently show_workqueue_state shows the state of all workqueues and of all worker pools. In certain cases we may need to dump state of only a specific workqueue or worker pool. For example in destroy_workqueue we only need to show state of the workqueue which is getting destroyed. So rename show_workqueue_state to show_all_workqueues(to signify it dumps state of all busy workqueues) and divide it into more granular functions (show_one_workqueue and show_one_worker_pool), that would show states of individual workqueues and worker pools and can be used in cases such as the one mentioned above. Also, as mentioned earlier, make destroy_workqueue dump data pertaining to only the workqueue that is being destroyed and make user(s) of earlier interface(show_workqueue_state), use new interface (show_all_workqueues). Signed-off-by: Imran Khan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 74d3c1efd9bb..7fee9b6cfede 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -469,7 +469,8 @@ extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); -extern void show_workqueue_state(void); +extern void show_all_workqueues(void); +extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** -- cgit v1.2.3 From f783484381ad088490a497abb4faab47511774c5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:35 +0200 Subject: mfd: ti_am335x_tscadc: Use driver data So far every sub-cell parameter in this driver was hardcoded: cell name, cell compatible, specific clock name and desired clock frequency. As we are about to introduce support for ADC1/magnetic reader, we need a bit of flexibility. 
Let's add a driver data structure which will contain these information. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-18-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index ffc091b77633..bb3b56ade3fb 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -162,11 +162,20 @@ #define TSCADC_CELLS 2 +struct ti_tscadc_data { + char *adc_feature_name; + char *adc_feature_compatible; + char *secondary_feature_name; + char *secondary_feature_compatible; + unsigned int target_clk_rate; +}; + struct ti_tscadc_dev { struct device *dev; struct regmap *regmap; void __iomem *tscadc_base; phys_addr_t tscadc_phys_base; + const struct ti_tscadc_data *data; int irq; int used_cells; /* 1-2 */ int tsc_wires; -- cgit v1.2.3 From 7c605802f33176d872ffb6a3830ba4d88d9f21f6 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:37 +0200 Subject: mfd: ti_am335x_tscadc: Drop useless variables from the driver structure Keeping the count of tsc_cells and adc_cells is completely redundant, we can derive this information from other variables. Plus, these variables are not used anywhere else now. Let's get rid of them. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-20-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index bb3b56ade3fb..23442059d271 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -177,10 +177,7 @@ struct ti_tscadc_dev { phys_addr_t tscadc_phys_base; const struct ti_tscadc_data *data; int irq; - int used_cells; /* 1-2 */ int tsc_wires; - int tsc_cell; /* -1 if not used */ - int adc_cell; /* -1 if not used */ struct mfd_cell cells[TSCADC_CELLS]; u32 reg_se_cache; bool adc_waiting; -- cgit v1.2.3 From b813f32030e2b70adf68f73e99853f91526aa3a1 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:40 +0200 Subject: mfd: ti_am335x_tscadc: Gather the ctrl register logic in one place Instead of deriving in the probe and in the resume path the value of the ctrl register, let's do it only once in the probe, save the value of this register (all but the subsystem enable bit) in the driver's structure and use it from the resume callback. 
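A sketch of the probe/resume split described above. The helper names are invented and REG_CTRL plus CNTRLREG_TSCSSENB are assumed from the driver; only the ctrl field itself is introduced by the diff below:

	/* probe: compute the control word once, cache all but the enable bit */
	static void demo_tscadc_enable(struct ti_tscadc_dev *tscadc, u32 ctrl)
	{
		tscadc->ctrl = ctrl & ~CNTRLREG_TSCSSENB;
		regmap_write(tscadc->regmap, REG_CTRL,
			     tscadc->ctrl | CNTRLREG_TSCSSENB);
	}

	/* resume: simply replay the cached value and re-enable */
	static void demo_tscadc_resume(struct ti_tscadc_dev *tscadc)
	{
		regmap_write(tscadc->regmap, REG_CTRL,
			     tscadc->ctrl | CNTRLREG_TSCSSENB);
	}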
Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-23-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 23442059d271..c9c6f0b29181 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -177,8 +177,8 @@ struct ti_tscadc_dev { phys_addr_t tscadc_phys_base; const struct ti_tscadc_data *data; int irq; - int tsc_wires; struct mfd_cell cells[TSCADC_CELLS]; + u32 ctrl; u32 reg_se_cache; bool adc_waiting; bool adc_in_use; -- cgit v1.2.3 From 36782dab984a8b4ef8f4ec8c2270c56468cbbbeb Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:41 +0200 Subject: mfd: ti_am335x_tscadc: Replace the header license text with SPDX tag Drop the text license and replace it with an equivalent SPDX license tag identifier. Signed-off-by: Miquel Raynal Acked-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-24-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index c9c6f0b29181..2b8a5f58a3b6 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -1,21 +1,13 @@ -#ifndef __LINUX_TI_AM335X_TSCADC_MFD_H -#define __LINUX_TI_AM335X_TSCADC_MFD_H - +/* SPDX-License-Identifier: GPL-2.0-only */ /* * TI Touch Screen / ADC MFD driver * * Copyright (C) 2012 Texas Instruments Incorporated - https://www.ti.com/ - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ +#ifndef __LINUX_TI_AM335X_TSCADC_MFD_H +#define __LINUX_TI_AM335X_TSCADC_MFD_H + #include #define REG_RAWIRQSTATUS 0x024 -- cgit v1.2.3 From 3831abe135566813341cc36b1493d75f6ac460c3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:42 +0200 Subject: mfd: ti_am335x_tscadc: Fix header spacing Harmonize the spacing within macro definitions. 
Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-25-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 2b8a5f58a3b6..893c474c1f8c 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -41,7 +41,7 @@ /* Step Enable */ #define STEPENB_MASK (0x1FFFF << 0) #define STEPENB(val) ((val) << 0) -#define ENB(val) (1 << (val)) +#define ENB(val) (1 << (val)) #define STPENB_STEPENB STEPENB(0x1FFFF) #define STPENB_STEPENB_TC STEPENB(0x1FFF) @@ -122,15 +122,15 @@ #define CNTRLREG_TSCENB BIT(7) /* FIFO READ Register */ -#define FIFOREAD_DATA_MASK (0xfff << 0) -#define FIFOREAD_CHNLID_MASK (0xf << 16) +#define FIFOREAD_DATA_MASK (0xfff << 0) +#define FIFOREAD_CHNLID_MASK (0xf << 16) /* DMA ENABLE/CLEAR Register */ #define DMA_FIFO0 BIT(0) #define DMA_FIFO1 BIT(1) /* Sequencer Status */ -#define SEQ_STATUS BIT(5) +#define SEQ_STATUS BIT(5) #define CHARGE_STEP 0x11 #define ADC_CLK 3000000 @@ -150,7 +150,7 @@ * * max processing time: 266431 * 308ns = 83ms(approx) */ -#define IDLE_TIMEOUT 83 /* milliseconds */ +#define IDLE_TIMEOUT 83 /* milliseconds */ #define TSCADC_CELLS 2 -- cgit v1.2.3 From 48959fcdca8b335afa4485e71114008c81291bf9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:43 +0200 Subject: mfd: ti_am335x_tscadc: Use the new HZ_PER_MHZ macro Before adding another frequency with even more zeroes, use the HZ_PER_MHZ macro to clarify the number. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-26-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 893c474c1f8c..a85643677bef 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -9,6 +9,7 @@ #define __LINUX_TI_AM335X_TSCADC_MFD_H #include +#include #define REG_RAWIRQSTATUS 0x024 #define REG_IRQSTATUS 0x028 @@ -133,7 +134,7 @@ #define SEQ_STATUS BIT(5) #define CHARGE_STEP 0x11 -#define ADC_CLK 3000000 +#define ADC_CLK (3 * HZ_PER_MHZ) #define TOTAL_STEPS 16 #define TOTAL_CHANNELS 8 #define FIFO1_THRESHOLD 19 -- cgit v1.2.3 From 65de5532a317b22f7fb93001cd0ed494145d3f4e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:44 +0200 Subject: mfd: ti_am335x_tscadc: Drop unused definitions from the header The STEP ENABLE definitions are highly unclear and not used so drop them. 
Suggested-by: Jonathan Cameron Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-27-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index a85643677bef..1cd8cd34f2b7 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -39,13 +39,6 @@ /* IRQ wakeup enable */ #define IRQWKUP_ENB BIT(0) -/* Step Enable */ -#define STEPENB_MASK (0x1FFFF << 0) -#define STEPENB(val) ((val) << 0) -#define ENB(val) (1 << (val)) -#define STPENB_STEPENB STEPENB(0x1FFFF) -#define STPENB_STEPENB_TC STEPENB(0x1FFF) - /* IRQ enable */ #define IRQENB_HW_PEN BIT(0) #define IRQENB_EOS BIT(1) -- cgit v1.2.3 From b7cb7bf11817129c623fa921a1d6cd86b3cb727b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:45 +0200 Subject: mfd: ti_am335x_tscadc: Use BIT(), GENMASK() and FIELD_PREP() when relevant Clean the ti_am335x_tscadc.h header by: * converting masks to GENMASK() * converting regular shifts to BIT() * using FIELD_PREP() when relevant Sometimes reorder the lines to be able to use the relevant bitmask. Mind the s/%d/%ld/ change in a log due to the type change following the use of FIELD_PREP() in the header. Suggested-by: Jonathan Cameron Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-28-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 61 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 1cd8cd34f2b7..ae694fa2d711 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -8,6 +8,7 @@ #ifndef __LINUX_TI_AM335X_TSCADC_MFD_H #define __LINUX_TI_AM335X_TSCADC_MFD_H +#include #include #include @@ -51,12 +52,12 @@ #define IRQENB_PENUP BIT(9) /* Step Configuration */ -#define STEPCONFIG_MODE_MASK (3 << 0) -#define STEPCONFIG_MODE(val) ((val) << 0) +#define STEPCONFIG_MODE_MASK GENMASK(1, 0) +#define STEPCONFIG_MODE(val) FIELD_PREP(STEPCONFIG_MODE_MASK, (val)) #define STEPCONFIG_MODE_SWCNT STEPCONFIG_MODE(1) #define STEPCONFIG_MODE_HWSYNC STEPCONFIG_MODE(2) -#define STEPCONFIG_AVG_MASK (7 << 2) -#define STEPCONFIG_AVG(val) ((val) << 2) +#define STEPCONFIG_AVG_MASK GENMASK(4, 2) +#define STEPCONFIG_AVG(val) FIELD_PREP(STEPCONFIG_AVG_MASK, (val)) #define STEPCONFIG_AVG_16 STEPCONFIG_AVG(4) #define STEPCONFIG_XPP BIT(5) #define STEPCONFIG_XNN BIT(6) @@ -64,43 +65,43 @@ #define STEPCONFIG_YNN BIT(8) #define STEPCONFIG_XNP BIT(9) #define STEPCONFIG_YPN BIT(10) -#define STEPCONFIG_RFP(val) ((val) << 12) -#define STEPCONFIG_RFP_VREFP (0x3 << 12) -#define STEPCONFIG_INM_MASK (0xF << 15) -#define STEPCONFIG_INM(val) ((val) << 15) +#define STEPCONFIG_RFP_VREFP GENMASK(13, 12) +#define STEPCONFIG_RFP(val) FIELD_PREP(STEPCONFIG_RFP_VREFP, (val)) +#define STEPCONFIG_INM_MASK GENMASK(18, 15) +#define STEPCONFIG_INM(val) FIELD_PREP(STEPCONFIG_INM_MASK, (val)) #define STEPCONFIG_INM_ADCREFM STEPCONFIG_INM(8) -#define STEPCONFIG_INP_MASK (0xF << 19) -#define STEPCONFIG_INP(val) ((val) << 19) +#define STEPCONFIG_INP_MASK GENMASK(22, 19) +#define STEPCONFIG_INP(val) FIELD_PREP(STEPCONFIG_INP_MASK, (val)) #define 
STEPCONFIG_INP_AN4 STEPCONFIG_INP(4) #define STEPCONFIG_INP_ADCREFM STEPCONFIG_INP(8) #define STEPCONFIG_FIFO1 BIT(26) -#define STEPCONFIG_RFM(val) ((val) << 23) -#define STEPCONFIG_RFM_VREFN (0x3 << 23) +#define STEPCONFIG_RFM_VREFN GENMASK(24, 23) +#define STEPCONFIG_RFM(val) FIELD_PREP(STEPCONFIG_RFM_VREFN, (val)) /* Delay register */ -#define STEPDELAY_OPEN_MASK (0x3FFFF << 0) -#define STEPDELAY_OPEN(val) ((val) << 0) +#define STEPDELAY_OPEN_MASK GENMASK(17, 0) +#define STEPDELAY_OPEN(val) FIELD_PREP(STEPDELAY_OPEN_MASK, (val)) #define STEPCONFIG_OPENDLY STEPDELAY_OPEN(0x098) -#define STEPDELAY_SAMPLE_MASK (0xFF << 24) -#define STEPDELAY_SAMPLE(val) ((val) << 24) +#define STEPDELAY_SAMPLE_MASK GENMASK(31, 24) +#define STEPDELAY_SAMPLE(val) FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val)) #define STEPCONFIG_SAMPLEDLY STEPDELAY_SAMPLE(0) /* Charge Config */ -#define STEPCHARGE_RFP_MASK (7 << 12) -#define STEPCHARGE_RFP(val) ((val) << 12) +#define STEPCHARGE_RFP_MASK GENMASK(14, 12) +#define STEPCHARGE_RFP(val) FIELD_PREP(STEPCHARGE_RFP_MASK, (val)) #define STEPCHARGE_RFP_XPUL STEPCHARGE_RFP(1) -#define STEPCHARGE_INM_MASK (0xF << 15) -#define STEPCHARGE_INM(val) ((val) << 15) +#define STEPCHARGE_INM_MASK GENMASK(18, 15) +#define STEPCHARGE_INM(val) FIELD_PREP(STEPCHARGE_INM_MASK, (val)) #define STEPCHARGE_INM_AN1 STEPCHARGE_INM(1) -#define STEPCHARGE_INP_MASK (0xF << 19) -#define STEPCHARGE_INP(val) ((val) << 19) -#define STEPCHARGE_RFM_MASK (3 << 23) -#define STEPCHARGE_RFM(val) ((val) << 23) +#define STEPCHARGE_INP_MASK GENMASK(22, 19) +#define STEPCHARGE_INP(val) FIELD_PREP(STEPCHARGE_INP_MASK, (val)) +#define STEPCHARGE_RFM_MASK GENMASK(24, 23) +#define STEPCHARGE_RFM(val) FIELD_PREP(STEPCHARGE_RFM_MASK, (val)) #define STEPCHARGE_RFM_XNUR STEPCHARGE_RFM(1) /* Charge delay */ -#define CHARGEDLY_OPEN_MASK (0x3FFFF << 0) -#define CHARGEDLY_OPEN(val) ((val) << 0) +#define CHARGEDLY_OPEN_MASK GENMASK(17, 0) +#define CHARGEDLY_OPEN(val) FIELD_PREP(CHARGEDLY_OPEN_MASK, (val)) #define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(0x400) /* Control register */ @@ -108,16 +109,16 @@ #define CNTRLREG_STEPID BIT(1) #define CNTRLREG_STEPCONFIGWRT BIT(2) #define CNTRLREG_POWERDOWN BIT(4) -#define CNTRLREG_AFE_CTRL_MASK (3 << 5) -#define CNTRLREG_AFE_CTRL(val) ((val) << 5) +#define CNTRLREG_AFE_CTRL_MASK GENMASK(6, 5) +#define CNTRLREG_AFE_CTRL(val) FIELD_PREP(CNTRLREG_AFE_CTRL_MASK, (val)) #define CNTRLREG_4WIRE CNTRLREG_AFE_CTRL(1) #define CNTRLREG_5WIRE CNTRLREG_AFE_CTRL(2) #define CNTRLREG_8WIRE CNTRLREG_AFE_CTRL(3) #define CNTRLREG_TSCENB BIT(7) /* FIFO READ Register */ -#define FIFOREAD_DATA_MASK (0xfff << 0) -#define FIFOREAD_CHNLID_MASK (0xf << 16) +#define FIFOREAD_DATA_MASK GENMASK(11, 0) +#define FIFOREAD_CHNLID_MASK GENMASK(19, 16) /* DMA ENABLE/CLEAR Register */ #define DMA_FIFO0 BIT(0) -- cgit v1.2.3 From 01d838164b4c305c1cafb0c3f71fb0027d99358b Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Mon, 23 Aug 2021 05:56:48 -0700 Subject: nvme-fc: add support for ->map_queues NVMe FC don't have support for map queues, unlike the PCI, RDMA and TCP transports. Add a ->map_queues callout for the LLDDs to provide such functionality. 
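To show the intended shape of the new entrypoint, here is a hypothetical LLDD callback (the function name is invented; the signature comes from the diff below, and blk_mq_map_queues() is the generic software spread):

	static void demo_fc_map_queues(struct nvme_fc_local_port *localport,
				       struct blk_mq_queue_map *map)
	{
		/* a PCI HBA driver with IRQ affinity information might
		 * call blk_mq_pci_map_queues() here instead */
		blk_mq_map_queues(map);
	}

The LLDD would then set .map_queues = demo_fc_map_queues in its nvme_fc_port_template.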
Signed-off-by: Saurav Kashyap
Signed-off-by: Nilesh Javali
Signed-off-by: Christoph Hellwig
---
 include/linux/nvme-fc-driver.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 2a38f2b477a5..cb909edb76c4 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -7,6 +7,7 @@
 #define _NVME_FC_DRIVER_H 1
 
 #include
+#include
 
 /*
@@ -497,6 +498,8 @@ struct nvme_fc_port_template {
 	int	(*xmt_ls_rsp)(struct nvme_fc_local_port *localport,
 				struct nvme_fc_remote_port *rport,
 				struct nvmefc_ls_rsp *ls_rsp);
+	void	(*map_queues)(struct nvme_fc_local_port *localport,
+			      struct blk_mq_queue_map *map);
 
 	u32	max_hw_queues;
 	u16	max_sgl_segments;
@@ -779,6 +782,10 @@ struct nvmet_fc_target_port {
  *       LS received.
  *       Entrypoint is Mandatory.
  *
+ * @map_queues: This function lets the driver expose the queue mapping
+ *       to the block layer.
+ *       Entrypoint is Optional.
+ *
  * @fcp_op:  Called to perform a data transfer or transmit a response.
  *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
  *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
-- cgit v1.2.3

From 44c3c6257e99c6284f312206de73783575fc8906 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Thu, 23 Sep 2021 00:55:35 +0300
Subject: nvme-rdma: limit the maximal queue size for RDMA controllers

The current limit of 1024 isn't valid for some of the RDMA-based
controllers. If the target exposes a cap with a larger number of entries
(e.g. 1024), the initiator may fail to create a QP of this size. Thus,
limit it to a value that works for all RDMA adapters.

A future, more general solution should use the RDMA/core API to calculate
this size according to device capabilities and the number of WRs needed
per NVMe IO request.

Signed-off-by: Max Gurtovoy
Signed-off-by: Christoph Hellwig
---
 include/linux/nvme-rdma.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 3ec8e50efa16..4dd7e6fe92fb 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -6,6 +6,8 @@
 #ifndef _LINUX_NVME_RDMA_H
 #define _LINUX_NVME_RDMA_H
 
+#define NVME_RDMA_MAX_QUEUE_SIZE	128
+
 enum nvme_rdma_cm_fmt {
 	NVME_RDMA_CM_FMT_1_0 = 0x0,
 };
-- cgit v1.2.3

From e15a8a9755659ff5972f30de4dd64867c97f242d Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Wed, 22 Sep 2021 08:35:20 +0200
Subject: nvme: add CNTRLTYPE definitions for 'identify controller'

Update the 'identify controller' structure to define the newly added
CNTRLTYPE field.
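An illustrative consumer of the new field (a sketch, not code from this series):

	static bool demo_is_discovery_ctrl(const struct nvme_id_ctrl *id)
	{
		/* NVME_CTRL_DISC identifies a discovery controller */
		return id->cntrltype == NVME_CTRL_DISC;
	}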
Signed-off-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b7c4c4130b65..ed2428918bca 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -31,6 +31,12 @@ enum nvme_subsys_type { NVME_NQN_NVME = 2, /* NVME type target subsystem */ }; +enum nvme_ctrl_type { + NVME_CTRL_IO = 1, /* I/O controller */ + NVME_CTRL_DISC = 2, /* Discovery controller */ + NVME_CTRL_ADMIN = 3, /* Administrative controller */ +}; + /* Address Family codes for Discovery Log Page entry ADRFAM field */ enum { NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ @@ -244,7 +250,9 @@ struct nvme_id_ctrl { __le32 rtd3e; __le32 oaes; __le32 ctratt; - __u8 rsvd100[28]; + __u8 rsvd100[11]; + __u8 cntrltype; + __u8 fguid[16]; __le16 crdt1; __le16 crdt2; __le16 crdt3; -- cgit v1.2.3 From d56ae18f063e38eba47550c632519c9fe3f76b19 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 23 Sep 2021 13:17:44 +0300 Subject: nvmet: use macro definitions for setting cmic value This makes the code more readable. Signed-off-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ed2428918bca..357482dedb59 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -320,6 +320,7 @@ struct nvme_id_ctrl { }; enum { + NVME_CTRL_CMIC_MULTI_PORT = 1 << 0, NVME_CTRL_CMIC_MULTI_CTRL = 1 << 1, NVME_CTRL_CMIC_ANA = 1 << 3, NVME_CTRL_ONCS_COMPARE = 1 << 0, -- cgit v1.2.3 From cdd591fc86e38ad3899196066219fbbd845f3162 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 5 Jul 2021 17:26:28 +0200 Subject: iov_iter: Introduce fault_in_iov_iter_writeable Introduce a new fault_in_iov_iter_writeable helper for safely faulting in an iterator for writing. Uses get_user_pages() to fault in the pages without actually writing to them, which would be destructive. We'll use fault_in_iov_iter_writeable in gfs2 once we've determined that the iterator passed to .read_iter isn't in memory. Signed-off-by: Andreas Gruenbacher --- include/linux/pagemap.h | 1 + include/linux/uio.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9fe94f7a4f7e..2f7dd14083d9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,6 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); * Fault in userspace address range. 
*/ size_t fault_in_writeable(char __user *uaddr, size_t size); +size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, diff --git a/include/linux/uio.h b/include/linux/uio.h index d18458af6681..25d1c24fd829 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -134,6 +134,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); +size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); -- cgit v1.2.3 From a164ff53cbd34479aeac3366840669b10845ce53 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 14 Oct 2021 16:47:54 +0300 Subject: driver core: Provide device_match_acpi_handle() helper We have a couple of users of this helper, make it available for them. The prototype for the helper is specifically crafted in order to be easily used with bus_find_device() call. That's why its location is in the driver core rather than ACPI. Reviewed-by: Rafael J. Wysocki Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20211014134756.39092-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 062777a45a74..a039ab809753 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -143,6 +143,7 @@ int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); int device_match_devt(struct device *dev, const void *pdevt); int device_match_acpi_dev(struct device *dev, const void *adev); +int device_match_acpi_handle(struct device *dev, const void *handle); int device_match_any(struct device *dev, const void *unused); /* iterator helpers for buses */ -- cgit v1.2.3 From 14fe2471c62816ba82546fb68369d957c3a58b59 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Thu, 7 Oct 2021 16:05:38 +0300 Subject: net/mlx5: Lag, change multipath and bonding to be mutually exclusive Both multipath and bonding events are changing the HW LAG state independently. Handling one of the features events while the other is already enabled can cause unwanted behavior, for example handling bonding event while multipath enabled will disable the lag and cause multipath to stop working. Fix it by ignoring bonding event while in multipath and ignoring FIB events while in bonding mode. 
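Stepping back to the device_match_acpi_handle() helper above: its prototype is deliberately shaped so it can be passed straight to bus_find_device(). A hypothetical lookup (the bus chosen here is illustrative):

	static struct device *demo_find_dev_by_handle(acpi_handle handle)
	{
		return bus_find_device(&platform_bus_type, NULL, handle,
				       device_match_acpi_handle);
	}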
Fixes: 544fe7c2e654 ("net/mlx5e: Activate HW multipath and handle port affinity based on FIB events")
Signed-off-by: Maor Dickman
Reviewed-by: Roi Dayan
Reviewed-by: Mark Bloch
Signed-off-by: Saeed Mahameed
---
 include/linux/mlx5/driver.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e23417424373..f17d2101af7a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1138,7 +1138,6 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
 int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
-bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_master(struct mlx5_core_dev *dev);
 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev);
-- cgit v1.2.3

From f1985002839af80d6c84e9537834a81fb1364d6e Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 20 Oct 2021 09:33:21 +0100
Subject: irqchip: Provide stronger type checking for IRQCHIP_MATCH/IRQCHIP_DECLARE

Both IRQCHIP_DECLARE() and IRQCHIP_MATCH() use an underlying
struct of_device_id to encode the matching property and the init
callback. However, this callback is stored as a void * pointer, which
obviously defeats any attempt at stronger type checking.

Work around this by providing a new macro that builds on top of the
__typecheck() primitive and that can be used to warn when there is a
discrepancy between the drivers and core code.

Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/20211020104527.3066268-1-maz@kernel.org
---
 include/linux/irqchip.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 67351aac65ef..5de0dfc5d64d 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -14,8 +14,15 @@
 #include
 #include
 #include
+#include
 #include
 
+/* Undefined on purpose */
+extern of_irq_init_cb_t typecheck_irq_init_cb;
+
+#define typecheck_irq_init_cb(fn)					\
+	(__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn)
+
 /*
  * This macro must be used by the different irqchip drivers to declare
  * the association between their DT compatible string and their
@@ -26,14 +33,16 @@
  * @compstr: compatible string of the irqchip driver
  * @fn: initialization function
  */
-#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
+#define IRQCHIP_DECLARE(name, compat, fn)	\
+	OF_DECLARE_2(irqchip, name, compat, typecheck_irq_init_cb(fn))
 
 extern int platform_irqchip_probe(struct platform_device *pdev);
 
 #define IRQCHIP_PLATFORM_DRIVER_BEGIN(drv_name) \
 static const struct of_device_id drv_name##_irqchip_match_table[] = {
 
-#define IRQCHIP_MATCH(compat, fn) { .compatible = compat, .data = fn },
+#define IRQCHIP_MATCH(compat, fn) { .compatible = compat,		\
+				    .data = typecheck_irq_init_cb(fn), },
 
 #define IRQCHIP_PLATFORM_DRIVER_END(drv_name) \
 	{},						\
-- cgit v1.2.3

From 795e92ec5fd79027648bd7f779d34bad5b6f2f55 Mon Sep 17 00:00:00 2001
From: Rob Herring
Date: Wed, 6 Oct 2021 11:43:21 -0500
Subject: of: Add of_get_cpu_hwid() to read hardware ID from CPU nodes

There are various open-coded implementations parsing the CPU node 'reg'
property, which contains the CPU's hardware ID. Introduce a new function,
of_get_cpu_hwid(), to read the hardware ID. All the callers should be DT
only code, so no need for an empty function.
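For reference, a sketch of the open-coded pattern that of_get_cpu_hwid() replaces; the legacy snippet is an approximation, as the exact code varies per architecture:

	const __be32 *cell;
	u64 hwid;

	/* before (roughly): parse the "reg" property by hand */
	cell = of_get_property(cpun, "reg", NULL);
	hwid = of_read_number(cell, of_n_addr_cells(cpun));

	/* after: one call; the second argument selects the SMT thread */
	hwid = of_get_cpu_hwid(cpun, 0);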
Cc: Frank Rowand Signed-off-by: Rob Herring Tested-by: Florian Fainelli Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20211006164332.1981454-2-robh@kernel.org --- include/linux/of.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 6f1c41f109bb..807f8168dad9 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -353,6 +353,7 @@ extern struct device_node *of_get_cpu_node(int cpu, unsigned int *thread); extern struct device_node *of_get_next_cpu_node(struct device_node *prev); extern struct device_node *of_get_cpu_state_node(struct device_node *cpu_node, int index); +extern u64 of_get_cpu_hwid(struct device_node *cpun, unsigned int thread); #define for_each_property_of_node(dn, pp) \ for (pp = dn->properties; pp != NULL; pp = pp->next) -- cgit v1.2.3 From a3c85b2ee098c4a293148fab4eb96299e92b9524 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 14 Oct 2021 12:30:55 -0500 Subject: of: make of_node_check_flag() device_node parameter const The device_node argument isn't modified by of_node_check_flag(), so mark it const. Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20211014173055.2117872-1-nathanl@linux.ibm.com Signed-off-by: Rob Herring --- include/linux/of.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 807f8168dad9..ff143a027abc 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -185,7 +185,7 @@ static inline bool of_node_is_root(const struct device_node *node) return node && (node->parent == NULL); } -static inline int of_node_check_flag(struct device_node *n, unsigned long flag) +static inline int of_node_check_flag(const struct device_node *n, unsigned long flag) { return test_bit(flag, &n->_flags); } -- cgit v1.2.3 From b8419e7be6c6029eee3448fda45f4f9ad340c4ca Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Oct 2021 11:48:59 -0700 Subject: irqchip: Fix kernel-doc parameter typo for IRQCHIP_DECLARE The documentation refers to "compstr" when we have the parameter named "compat", fix the typo. Signed-off-by: Florian Fainelli Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211020184859.2705451-14-f.fainelli@gmail.com --- include/linux/irqchip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 5de0dfc5d64d..7f007b9c23f8 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -30,7 +30,7 @@ extern of_irq_init_cb_t typecheck_irq_init_cb; * * @name: name that must be unique across all IRQCHIP_DECLARE of the * same file. - * @compstr: compatible string of the irqchip driver + * @compat: compatible string of the irqchip driver * @fn: initialization function */ #define IRQCHIP_DECLARE(name, compat, fn) \ -- cgit v1.2.3 From 133a48abf6ecc535d7eddc6da1c3e4c972445882 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Oct 2021 15:37:42 -0400 Subject: NFS: Fix up commit deadlocks If O_DIRECT bumps the commit_info rpcs_out field, then that could lead to fsync() hangs. The fix is to ensure that O_DIRECT calls nfs_commit_end(). 
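The invariant the fix restores can be sketched as a begin/end pairing (illustrative; nfs_commit_begin() is the existing counterpart in fs/nfs/write.c, and the exact wait mechanics are assumptions):

	/*
	 *	nfs_commit_begin(cinfo);   // rpcs_out++
	 *	...issue COMMIT / O_DIRECT writes...
	 *	nfs_commit_end(cinfo);     // rpcs_out--, wake fsync() waiters
	 *
	 * An O_DIRECT path that bumps rpcs_out without the matching
	 * nfs_commit_end() leaves anyone waiting for rpcs_out to drain
	 * (e.g. fsync()) blocked forever - the reported hang.
	 */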
Fixes: 723c921e7dfc ("sched/wait, fs/nfs: Convert wait_on_atomic_t() usage to the new wait_var_event() API") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ca547cc5458c..e58b78da7a98 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -568,6 +568,7 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); +bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline int nfs_have_writebacks(struct inode *inode) -- cgit v1.2.3 From 0ebeebcf59601bcfa0284f4bb7abdec051eb856d Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Sun, 10 Oct 2021 18:23:13 -0400 Subject: NFS: Fix WARN_ON due to unionization of nfs_inode.nrequests Fixes the following WARN_ON WARNING: CPU: 2 PID: 18678 at fs/nfs/inode.c:123 nfs_clear_inode+0x3b/0x50 [nfs] ... Call Trace: nfs4_evict_inode+0x57/0x70 [nfsv4] evict+0xd1/0x180 dispose_list+0x48/0x60 evict_inodes+0x156/0x190 generic_shutdown_super+0x37/0x110 nfs_kill_super+0x1d/0x40 [nfs] deactivate_locked_super+0x36/0xa0 Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e58b78da7a98..457b866a2d9e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -573,7 +573,9 @@ bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline int nfs_have_writebacks(struct inode *inode) { - return atomic_long_read(&NFS_I(inode)->nrequests) != 0; + if (S_ISREG(inode->i_mode)) + return atomic_long_read(&NFS_I(inode)->nrequests) != 0; + return 0; } /* -- cgit v1.2.3 From e591b298d7ecb851e200f65946e3d53fe78a3c4f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Sep 2021 17:41:41 -0400 Subject: NFS: Save some space in the inode Save some space in the nfs_inode by setting up an anonymous union with the fields that are peculiar to a specific type of filesystem object. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 457b866a2d9e..739ca1ef934f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -155,33 +155,39 @@ struct nfs_inode { unsigned long attrtimeo_timestamp; unsigned long attr_gencount; - /* "Generation counter" for the attribute cache. This is - * bumped whenever we update the metadata on the - * server. - */ - unsigned long cache_change_attribute; struct rb_root access_cache; struct list_head access_cache_entry_lru; struct list_head access_cache_inode_lru; - /* - * This is the cookie verifier used for NFSv3 readdir - * operations - */ - __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; - - atomic_long_t nrequests; - struct nfs_mds_commit_info commit_info; + union { + /* Directory */ + struct { + /* "Generation counter" for the attribute cache. + * This is bumped whenever we update the metadata + * on the server. 
+ */ + unsigned long cache_change_attribute; + /* + * This is the cookie verifier used for NFSv3 readdir + * operations + */ + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; + /* Readers: in-flight sillydelete RPC calls */ + /* Writers: rmdir */ + struct rw_semaphore rmdir_sem; + }; + /* Regular file */ + struct { + atomic_long_t nrequests; + struct nfs_mds_commit_info commit_info; + struct mutex commit_mutex; + }; + }; /* Open contexts for shared mmap writes */ struct list_head open_files; - /* Readers: in-flight sillydelete RPC calls */ - /* Writers: rmdir */ - struct rw_semaphore rmdir_sem; - struct mutex commit_mutex; - #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ -- cgit v1.2.3 From 0c0593b45c9b4e5b212ffb3fb28bb8d3c0ec0dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Oct 2021 11:13:31 +0200 Subject: x86/ftrace: Make function graph use ftrace directly We don't need special hook for graph tracer entry point, but instead we can use graph_ops::func function to install the return_hooker. This moves the graph tracing setup _before_ the direct trampoline prepares the stack, so the return_hooker will be called when the direct trampoline is finished. This simplifies the code, because we don't need to take into account the direct trampoline setup when preparing the graph tracer hooker and we can allow function graph tracer on entries registered with direct trampoline. Link: https://lkml.kernel.org/r/20211008091336.33616-4-jolsa@kernel.org [fixed compile error reported by kernel test robot ] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 12fcfa2d23ea..16a7baaba702 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -803,6 +803,15 @@ static inline bool is_ftrace_trampoline(unsigned long addr) } #endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifndef ftrace_graph_func +#define ftrace_graph_func ftrace_stub +#define FTRACE_OPS_GRAPH_STUB FTRACE_OPS_FL_STUB +#else +#define FTRACE_OPS_GRAPH_STUB 0 +#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); -- cgit v1.2.3 From e967b60eb51116d2347093655e555aa036dd40d8 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:46 +0200 Subject: mfd: ti_am335x_tscadc: Clarify the maximum values for DT entries Clearly define the maximum open delay and sample delay. Use these definitions in place of a mask (which works because this is the first field in the register) and an open-coded value. While at it reword a little bit the error messages to make them look clearer and similar. 
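A sketch of the kind of bounds check the new definitions enable; the variable name and message are assumptions, while STEPCONFIG_MAX_OPENDLY itself appears in the diff below:

	if (open_delay > STEPCONFIG_MAX_OPENDLY) {
		dev_err(dev, "invalid open delay\n");
		return -EINVAL;
	}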
Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-29-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index ae694fa2d711..31cffb6e8b17 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -84,7 +84,9 @@ #define STEPCONFIG_OPENDLY STEPDELAY_OPEN(0x098) #define STEPDELAY_SAMPLE_MASK GENMASK(31, 24) #define STEPDELAY_SAMPLE(val) FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val)) +#define STEPCONFIG_MAX_OPENDLY GENMASK(17, 0) #define STEPCONFIG_SAMPLEDLY STEPDELAY_SAMPLE(0) +#define STEPCONFIG_MAX_SAMPLE GENMASK(7, 0) /* Charge Config */ #define STEPCHARGE_RFP_MASK GENMASK(14, 12) -- cgit v1.2.3 From 0fd12262613116a38fd76aefa396e9bc116fbfe2 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:47 +0200 Subject: mfd: ti_am335x_tscadc: Drop useless definitions from the header Drop useless definitions from the header like the "masks" definitions which are only used by the following definition. It could be possible to got even further by removing these definitions entirely and use FIELD_PREP() macros from the code directly, but while I have no troubles making these changes in the header, changing the values in the code directly could darkening a bit the logic and hardening future git-blames for very little added value IMHO (but this is of course a personal taste). Certain macros are using GENMASK() to define the value of a particular field, while this is purely "by chance" that the value and the mask have the same value. In this case, drop the "mask" definition, use FIELD_PREP() and GENMASK() in the macro defining the field, and use the new macro to define the particular value by feeding directly the actual number advertised in the datasheet into that macro, as in: -#define STEPCONFIG_RFM_VREFN GENMASK(24, 23) -#define STEPCONFIG_RFM(val) FIELD_PREP(STEPCONFIG_RFM_VREFN, (val)) +#define STEPCONFIG_RFM(val) FIELD_PREP(GENMASK(24, 23), (val)) +#define STEPCONFIG_RFM_VREFN STEPCONFIG_RFM(3) Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-30-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 44 +++++++++++++----------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 31cffb6e8b17..51d987080cd3 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -52,12 +52,10 @@ #define IRQENB_PENUP BIT(9) /* Step Configuration */ -#define STEPCONFIG_MODE_MASK GENMASK(1, 0) -#define STEPCONFIG_MODE(val) FIELD_PREP(STEPCONFIG_MODE_MASK, (val)) +#define STEPCONFIG_MODE(val) FIELD_PREP(GENMASK(1, 0), (val)) #define STEPCONFIG_MODE_SWCNT STEPCONFIG_MODE(1) #define STEPCONFIG_MODE_HWSYNC STEPCONFIG_MODE(2) -#define STEPCONFIG_AVG_MASK GENMASK(4, 2) -#define STEPCONFIG_AVG(val) FIELD_PREP(STEPCONFIG_AVG_MASK, (val)) +#define STEPCONFIG_AVG(val) FIELD_PREP(GENMASK(4, 2), (val)) #define STEPCONFIG_AVG_16 STEPCONFIG_AVG(4) #define STEPCONFIG_XPP BIT(5) #define STEPCONFIG_XNN BIT(6) @@ -65,45 +63,36 @@ #define STEPCONFIG_YNN BIT(8) #define STEPCONFIG_XNP BIT(9) #define STEPCONFIG_YPN BIT(10) -#define STEPCONFIG_RFP_VREFP GENMASK(13, 
12) -#define STEPCONFIG_RFP(val) FIELD_PREP(STEPCONFIG_RFP_VREFP, (val)) -#define STEPCONFIG_INM_MASK GENMASK(18, 15) -#define STEPCONFIG_INM(val) FIELD_PREP(STEPCONFIG_INM_MASK, (val)) +#define STEPCONFIG_RFP(val) FIELD_PREP(GENMASK(13, 12), (val)) +#define STEPCONFIG_RFP_VREFP STEPCONFIG_RFP(3) +#define STEPCONFIG_INM(val) FIELD_PREP(GENMASK(18, 15), (val)) #define STEPCONFIG_INM_ADCREFM STEPCONFIG_INM(8) -#define STEPCONFIG_INP_MASK GENMASK(22, 19) -#define STEPCONFIG_INP(val) FIELD_PREP(STEPCONFIG_INP_MASK, (val)) +#define STEPCONFIG_INP(val) FIELD_PREP(GENMASK(22, 19), (val)) #define STEPCONFIG_INP_AN4 STEPCONFIG_INP(4) #define STEPCONFIG_INP_ADCREFM STEPCONFIG_INP(8) #define STEPCONFIG_FIFO1 BIT(26) -#define STEPCONFIG_RFM_VREFN GENMASK(24, 23) -#define STEPCONFIG_RFM(val) FIELD_PREP(STEPCONFIG_RFM_VREFN, (val)) +#define STEPCONFIG_RFM(val) FIELD_PREP(GENMASK(24, 23), (val)) +#define STEPCONFIG_RFM_VREFN STEPCONFIG_RFM(3) /* Delay register */ -#define STEPDELAY_OPEN_MASK GENMASK(17, 0) -#define STEPDELAY_OPEN(val) FIELD_PREP(STEPDELAY_OPEN_MASK, (val)) +#define STEPDELAY_OPEN(val) FIELD_PREP(GENMASK(17, 0), (val)) #define STEPCONFIG_OPENDLY STEPDELAY_OPEN(0x098) -#define STEPDELAY_SAMPLE_MASK GENMASK(31, 24) -#define STEPDELAY_SAMPLE(val) FIELD_PREP(STEPDELAY_SAMPLE_MASK, (val)) #define STEPCONFIG_MAX_OPENDLY GENMASK(17, 0) +#define STEPDELAY_SAMPLE(val) FIELD_PREP(GENMASK(31, 24), (val)) #define STEPCONFIG_SAMPLEDLY STEPDELAY_SAMPLE(0) #define STEPCONFIG_MAX_SAMPLE GENMASK(7, 0) /* Charge Config */ -#define STEPCHARGE_RFP_MASK GENMASK(14, 12) -#define STEPCHARGE_RFP(val) FIELD_PREP(STEPCHARGE_RFP_MASK, (val)) +#define STEPCHARGE_RFP(val) FIELD_PREP(GENMASK(14, 12), (val)) #define STEPCHARGE_RFP_XPUL STEPCHARGE_RFP(1) -#define STEPCHARGE_INM_MASK GENMASK(18, 15) -#define STEPCHARGE_INM(val) FIELD_PREP(STEPCHARGE_INM_MASK, (val)) +#define STEPCHARGE_INM(val) FIELD_PREP(GENMASK(18, 15), (val)) #define STEPCHARGE_INM_AN1 STEPCHARGE_INM(1) -#define STEPCHARGE_INP_MASK GENMASK(22, 19) -#define STEPCHARGE_INP(val) FIELD_PREP(STEPCHARGE_INP_MASK, (val)) -#define STEPCHARGE_RFM_MASK GENMASK(24, 23) -#define STEPCHARGE_RFM(val) FIELD_PREP(STEPCHARGE_RFM_MASK, (val)) +#define STEPCHARGE_INP(val) FIELD_PREP(GENMASK(22, 19), (val)) +#define STEPCHARGE_RFM(val) FIELD_PREP(GENMASK(24, 23), (val)) #define STEPCHARGE_RFM_XNUR STEPCHARGE_RFM(1) /* Charge delay */ -#define CHARGEDLY_OPEN_MASK GENMASK(17, 0) -#define CHARGEDLY_OPEN(val) FIELD_PREP(CHARGEDLY_OPEN_MASK, (val)) +#define CHARGEDLY_OPEN(val) FIELD_PREP(GENMASK(17, 0), (val)) #define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(0x400) /* Control register */ @@ -111,8 +100,7 @@ #define CNTRLREG_STEPID BIT(1) #define CNTRLREG_STEPCONFIGWRT BIT(2) #define CNTRLREG_POWERDOWN BIT(4) -#define CNTRLREG_AFE_CTRL_MASK GENMASK(6, 5) -#define CNTRLREG_AFE_CTRL(val) FIELD_PREP(CNTRLREG_AFE_CTRL_MASK, (val)) +#define CNTRLREG_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val)) #define CNTRLREG_4WIRE CNTRLREG_AFE_CTRL(1) #define CNTRLREG_5WIRE CNTRLREG_AFE_CTRL(2) #define CNTRLREG_8WIRE CNTRLREG_AFE_CTRL(3) -- cgit v1.2.3 From c3e36b5d069241f3cbc0e647cf297c5a329cd7a6 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:48 +0200 Subject: mfd: ti_am335x_tscadc: Rename the subsystem enable macro This bit is common to all devices (ADC, Touchscreen, Magnetic reader) so make it clear that it can be used from any location by operating a mechanical rename: s/CNTRLREG_TSCSSENB/CNTRLREG_SSENB/ Signed-off-by: Miquel Raynal Acked-by: Jonathan Cameron 
Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-31-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 51d987080cd3..860289ae8516 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -96,7 +96,7 @@ #define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(0x400) /* Control register */ -#define CNTRLREG_TSCSSENB BIT(0) +#define CNTRLREG_SSENB BIT(0) #define CNTRLREG_STEPID BIT(1) #define CNTRLREG_STEPCONFIGWRT BIT(2) #define CNTRLREG_POWERDOWN BIT(4) -- cgit v1.2.3 From 2f89c2619ce93bf2b0e6e721614fc33e8cf48f03 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:49 +0200 Subject: mfd: ti_am335x_tscadc: Add TSC prefix in certain macros While the register list (and names) between ADC0 and ADC1 are pretty close, the bits inside changed a little bit. To avoid any future confusion, let's add the TSC prefix when some bits are in a register that is common to both revisions of the ADC, but are specific to the am33xx hardware. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-32-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 860289ae8516..cc6de9258455 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -98,13 +98,13 @@ /* Control register */ #define CNTRLREG_SSENB BIT(0) #define CNTRLREG_STEPID BIT(1) -#define CNTRLREG_STEPCONFIGWRT BIT(2) +#define CNTRLREG_TSC_STEPCONFIGWRT BIT(2) #define CNTRLREG_POWERDOWN BIT(4) -#define CNTRLREG_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val)) -#define CNTRLREG_4WIRE CNTRLREG_AFE_CTRL(1) -#define CNTRLREG_5WIRE CNTRLREG_AFE_CTRL(2) -#define CNTRLREG_8WIRE CNTRLREG_AFE_CTRL(3) -#define CNTRLREG_TSCENB BIT(7) +#define CNTRLREG_TSC_AFE_CTRL(val) FIELD_PREP(GENMASK(6, 5), (val)) +#define CNTRLREG_TSC_4WIRE CNTRLREG_TSC_AFE_CTRL(1) +#define CNTRLREG_TSC_5WIRE CNTRLREG_TSC_AFE_CTRL(2) +#define CNTRLREG_TSC_8WIRE CNTRLREG_TSC_AFE_CTRL(3) +#define CNTRLREG_TSC_ENB BIT(7) /* FIFO READ Register */ #define FIFOREAD_DATA_MASK GENMASK(11, 0) @@ -118,7 +118,7 @@ #define SEQ_STATUS BIT(5) #define CHARGE_STEP 0x11 -#define ADC_CLK (3 * HZ_PER_MHZ) +#define TSC_ADC_CLK (3 * HZ_PER_MHZ) #define TOTAL_STEPS 16 #define TOTAL_CHANNELS 8 #define FIFO1_THRESHOLD 19 -- cgit v1.2.3 From bf0f394c7b1e4d623ca2afdf59b3b4b95cc6f592 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:53 +0200 Subject: mfd: ti_am335x_tscadc: Introduce a helper to deal with the type of hardware One way of knowing which hardware we are dealing with is to check the compatible string. When this must be done at several places, it's best and certainly more clear to use a helper for that. Introduce ti_adc_with_touchscreen() to indicate if there is a touchscreen controller available (meaning it's an am33xx-like ADC). This helper does not indicate if it is actually used (that is the purpose of the use_tsc boolean). 
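For example, code that must behave differently on the two flavors can branch on the helper like this (an illustrative sketch, not taken from the driver):

	if (ti_adc_with_touchscreen(tscadc)) {
		/* am33xx-like ADC: touchscreen steps may be configured */
	} else {
		/* ADC without a touchscreen controller */
	}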
Introducing this helper makes it easier to distinguish in the code between what is generic to both types of ADCs and what is specific to the am33xx hardware, before introducing support for the am437x hardware. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-36-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index cc6de9258455..ee160b2036c1 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -177,6 +177,12 @@ static inline struct ti_tscadc_dev *ti_tscadc_dev_get(struct platform_device *p) return *tscadc_dev; } +static inline bool ti_adc_with_touchscreen(struct ti_tscadc_dev *tscadc) +{ + return of_device_is_compatible(tscadc->dev->of_node, + "ti,am3359-tscadc"); +} + void am335x_tsc_se_set_cache(struct ti_tscadc_dev *tsadc, u32 val); void am335x_tsc_se_set_once(struct ti_tscadc_dev *tsadc, u32 val); void am335x_tsc_se_clr(struct ti_tscadc_dev *tsadc, u32 val); -- cgit v1.2.3 From 0a1233031c16d8575be6b864e6fd353b6fd758c4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:14:54 +0200 Subject: mfd: ti_am335x_tscadc: Add ADC1/magnetic reader support Introduce a new compatible that has another set of driver data, targeting am437x SoCs with a magnetic reader instead of the touchscreen and a more featureful set of registers. Signed-off-by: Miquel Raynal Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-37-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index ee160b2036c1..5225e3fc194d 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -106,6 +106,11 @@ #define CNTRLREG_TSC_8WIRE CNTRLREG_TSC_AFE_CTRL(3) #define CNTRLREG_TSC_ENB BIT(7) +/*Control registers bitfields for MAGADC IP */ +#define CNTRLREG_MAGADCENB BIT(0) +#define CNTRLREG_MAG_PREAMP_PWRDOWN BIT(5) +#define CNTRLREG_MAG_PREAMP_BYPASS BIT(6) + /* FIFO READ Register */ #define FIFOREAD_DATA_MASK GENMASK(11, 0) #define FIFOREAD_CHNLID_MASK GENMASK(19, 16) @@ -119,6 +124,7 @@ #define CHARGE_STEP 0x11 #define TSC_ADC_CLK (3 * HZ_PER_MHZ) +#define MAG_ADC_CLK (13 * HZ_PER_MHZ) #define TOTAL_STEPS 16 #define TOTAL_CHANNELS 8 #define FIFO1_THRESHOLD 19 -- cgit v1.2.3 From 789e5ebcc61b72a71717b02b62e6c6f10fdbb722 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Oct 2021 10:15:01 +0200 Subject: iio: adc: ti_am335x_adc: Add a unit to the timeout delay The lack of a unit in the macro name kind of tricked me when I was troubleshooting an issue. Physical constants should always get a unit.
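With the unit in the name, a call site reads unambiguously, as in this sketch (the adc_idle() helper is hypothetical):

	/* The _MS suffix makes the time base explicit at the call site */
	unsigned long timeout = jiffies + msecs_to_jiffies(IDLE_TIMEOUT_MS);

	while (!adc_idle(adc)) {
		if (time_after(jiffies, timeout))
			return -EAGAIN;
		cpu_relax();
	}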
Signed-off-by: Miquel Raynal Acked-by: Jonathan Cameron Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211015081506.933180-44-miquel.raynal@bootlin.com --- include/linux/mfd/ti_am335x_tscadc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index 5225e3fc194d..ba13e043d910 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -141,7 +141,7 @@ * * max processing time: 266431 * 308ns = 83ms(approx) */ -#define IDLE_TIMEOUT 83 /* milliseconds */ +#define IDLE_TIMEOUT_MS 83 /* milliseconds */ #define TSCADC_CELLS 2 -- cgit v1.2.3 From 9a48e7564ac83fb0f1d5b0eac5fe8a7af62da398 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Oct 2021 13:00:39 -0700 Subject: compiler-gcc.h: Define __SANITIZE_ADDRESS__ under hwaddress sanitizer When Clang is using the hwaddress sanitizer, it sets __SANITIZE_ADDRESS__ explicitly: #if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) /* Emulate GCC's __SANITIZE_ADDRESS__ flag */ #define __SANITIZE_ADDRESS__ #endif Once hwaddress sanitizer was added to GCC, however, a separate define was created, __SANITIZE_HWADDRESS__. The kernel is expecting to find __SANITIZE_ADDRESS__ in either case, though, and the existing string macros break on supported architectures: #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(__SANITIZE_ADDRESS__) whereas other architectures (like arm32) have no idea about the hwaddress sanitizer and just check for __SANITIZE_ADDRESS__: #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) This would lead to compiler fortify self-test warnings when building with CONFIG_KASAN_SW_TAGS=y: warning: unsafe memmove() usage lacked '__read_overflow2' symbol in lib/test_fortify/read_overflow2-memmove.c warning: unsafe memcpy() usage lacked '__write_overflow' symbol in lib/test_fortify/write_overflow-memcpy.c ... Sort this out by also defining __SANITIZE_ADDRESS__ in GCC under the hwaddress sanitizer. Suggested-by: Arnd Bergmann Cc: Nick Desaulniers Cc: Andrew Morton Cc: Will Deacon Cc: Arvind Sankar Cc: Masahiro Yamada Cc: llvm@lists.linux.dev Signed-off-by: Kees Cook Reviewed-by: Nathan Chancellor Acked-by: Miguel Ojeda Reviewed-by: Marco Elver Link: https://lore.kernel.org/r/20211020200039.170424-1-keescook@chromium.org --- include/linux/compiler-gcc.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9957085b8148..7bbd8df02532 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -121,6 +121,14 @@ #define __no_sanitize_coverage #endif +/* + * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel, + * matching the defines used by Clang. + */ +#ifdef __SANITIZE_HWADDRESS__ +#define __SANITIZE_ADDRESS__ +#endif + /* * Turn individual warnings and errors on and off locally, depending * on version. -- cgit v1.2.3 From 3598b30bd970bbc09dba0cf31aa0a04105f37207 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Oct 2021 21:08:06 +0200 Subject: cpufreq: Fix typo in cpufreq.h s/internale/internal/ Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff88bb3e44fc..3317887027b5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -385,7 +385,7 @@ struct cpufreq_driver { /* flags */ /* - * Set by drivers that need to update internale upper and lower boundaries along + * Set by drivers that need to update internal upper and lower boundaries along * with the target frequency and so the core and governors should also invoke * the diver if the target frequency does not change, but the policy min or max * may have changed. -- cgit v1.2.3 From e279317e9aeb11d8670e0a5acb10d50566eea9c9 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Fri, 15 Oct 2021 11:47:00 +0200 Subject: rpmsg: core: add API to get MTU Return the rpmsg buffer MTU for sending messages, so rpmsg users can split a long message across several rpmsg buffers. Reviewed-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Acked-by: Suman Anna Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20211015094701.5732-2-arnaud.pouliquen@foss.st.com Signed-off-by: Greg Kroah-Hartman --- include/linux/rpmsg.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index d97dcd049f18..990b80fb49ad 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -186,6 +186,8 @@ int rpmsg_trysend_offchannel(struct rpmsg_endpoint *ept, u32 src, u32 dst, __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait); +ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept); + #else static inline int rpmsg_register_device(struct rpmsg_device *rpdev) @@ -296,6 +298,14 @@ static inline __poll_t rpmsg_poll(struct rpmsg_endpoint *ept, return 0; } +static inline ssize_t rpmsg_get_mtu(struct rpmsg_endpoint *ept) +{ + /* This shouldn't be possible */ + WARN_ON(1); + + return -ENXIO; +} + #endif /* IS_ENABLED(CONFIG_RPMSG) */ /* use a macro to avoid include chaining to get THIS_MODULE */ -- cgit v1.2.3 From 8ac33b8b6841e99a624ace543d92cbf598a91381 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Thu, 21 Oct 2021 19:35:40 +0900 Subject: counter: Fix use-after-free race condition for events_queue_size write A race condition is possible when writing to events_queue_size where the events kfifo is freed during the execution of a kfifo_in(), resulting in a use-after-free. This patch prevents such a scenario by protecting the events queue-in operations with a spinlock and locking before performing the events queue size adjustment. The existing events_lock mutex is renamed to events_out_lock to reflect that it only protects events queue-out operations. Because the events queue-in operations can occur in an interrupt context, a new events_in_lock spinlock is introduced and utilized.
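A minimal sketch of the intended locking on the queue-in path (illustrative only; the real call sites live in the counter core):

	/* May run in IRQ context, so take the new spinlock */
	unsigned long flags;

	spin_lock_irqsave(&counter->events_in_lock, flags);
	kfifo_in(&counter->events, &event, 1);
	spin_unlock_irqrestore(&counter->events_in_lock, flags);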
Fixes: feff17a550c7 ("counter: Implement events_queue_size sysfs attribute") Cc: David Lechner Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/r/20211021103540.955639-1-vilhelm.gray@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/counter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/counter.h b/include/linux/counter.h index 0fd99e255a50..b7d0a00a61cf 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -296,7 +296,8 @@ struct counter_ops { * @n_events_list_lock: lock to protect Counter next events list operations * @events: queue of detected Counter events * @events_wait: wait queue to allow blocking reads of Counter events - * @events_lock: lock to protect Counter events queue read operations + * @events_in_lock: lock to protect Counter events queue in operations + * @events_out_lock: lock to protect Counter events queue out operations * @ops_exist_lock: lock to prevent use during removal */ struct counter_device { @@ -323,7 +324,8 @@ struct counter_device { struct mutex n_events_list_lock; DECLARE_KFIFO_PTR(events, struct counter_event); wait_queue_head_t events_wait; - struct mutex events_lock; + spinlock_t events_in_lock; + struct mutex events_out_lock; struct mutex ops_exist_lock; }; -- cgit v1.2.3 From 992e5cc7be8e693aba1f0ee7905d34c87fd7872a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 20 Oct 2021 20:49:55 +0300 Subject: net: dsa: tag_8021q: make dsa_8021q_{rx,tx}_vid take dp as argument Pass a single argument to dsa_8021q_rx_vid and dsa_8021q_tx_vid that contains the necessary information from the two arguments that are currently provided: the switch and the port number. Also rename those functions so that they have a dsa_port_* prefix, since they operate on a struct dsa_port *. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index c7fa4a3498fe..254b165f2b44 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -9,6 +9,7 @@ #include struct dsa_switch; +struct dsa_port; struct sk_buff; struct net_device; @@ -45,9 +46,9 @@ void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num); -u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); +u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); -u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); +u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp); int dsa_8021q_rx_switch_id(u16 vid); -- cgit v1.2.3 From 061514dbfb79910ef60eb40dd9fc528be3f45d62 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 18 Oct 2021 17:43:35 -0700 Subject: regulator: lp872x: Remove lp872x_dvs_state After this driver was converted to gpiod, clang started warning: vers/regulator/lp872x.c:689:57: error: implicit conversion from enumeration type 'enum lp872x_dvs_state' to different enumeration type 'enum gpiod_flags' [-Werror,-Wenum-conversion] dvs->gpio = devm_gpiod_get_optional(lp->dev, "ti,dvs", pinstate); ~~~~~~~~~~~~~~~~~~~~~~~ ^~~~~~~~ 1 error generated. lp872x_dvs_state was updated to have values from gpiod_flags but this is not enough to avoid an implicit conversion warning from either GCC or clang (although GCC enables this warning under -Wextra instead of -Wall like clang so it is not seen under normal builds). 
Eliminate lp872x_dvs_state in favor of using gpiod_flags everywhere so that there is no more warning about an implicit conversion. Fixes: 72bf80cf09c4 ("regulator: lp872x: replacing legacy gpio interface for gpiod") Link: https://github.com/ClangBuiltLinux/linux/issues/1481 Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20211019004335.193492-1-nathan@kernel.org Signed-off-by: Mark Brown --- include/linux/regulator/lp872x.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/lp872x.h b/include/linux/regulator/lp872x.h index 8e7e0343c6e1..b62e45aa1dd3 100644 --- a/include/linux/regulator/lp872x.h +++ b/include/linux/regulator/lp872x.h @@ -40,11 +40,6 @@ enum lp872x_regulator_id { LP872X_ID_MAX, }; -enum lp872x_dvs_state { - DVS_LOW = GPIOD_OUT_LOW, - DVS_HIGH = GPIOD_OUT_HIGH, -}; - enum lp872x_dvs_sel { SEL_V1, SEL_V2, @@ -59,7 +54,7 @@ enum lp872x_dvs_sel { struct lp872x_dvs { struct gpio_desc *gpio; enum lp872x_dvs_sel vsel; - enum lp872x_dvs_state init_state; + enum gpiod_flags init_state; }; /** -- cgit v1.2.3 From 6a8b5bb0f1350fc4cf398435a1119db12b0bd50e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Sun, 17 Oct 2021 15:06:39 -0300 Subject: regulator: tps62360: replacing legacy gpio interface for gpiod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removing all linux/gpio.h and linux/of_gpio.h dependencies and replacing them with the gpiod interface. Signed-off-by: Maíra Canal Link: https://lore.kernel.org/r/YWxmL2baF5AdzyHv@fedora Signed-off-by: Mark Brown --- include/linux/regulator/tps62360.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/tps62360.h b/include/linux/regulator/tps62360.h index 94a90c06f1e5..398e74a1d941 100644 --- a/include/linux/regulator/tps62360.h +++ b/include/linux/regulator/tps62360.h @@ -19,10 +19,6 @@ * @en_discharge: Enable discharge the output capacitor via internal * register. * @en_internal_pulldn: internal pull down enable or not. - * @vsel0_gpio: Gpio number for vsel0. It should be -1 if this is tied with - * fixed logic. - * @vsel1_gpio: Gpio number for vsel1. It should be -1 if this is tied with - * fixed logic. * @vsel0_def_state: Default state of vsel0. 1 if it is high else 0. * @vsel1_def_state: Default state of vsel1. 1 if it is high else 0. */ @@ -30,8 +26,6 @@ struct tps62360_regulator_platform_data { struct regulator_init_data *reg_init_data; bool en_discharge; bool en_internal_pulldn; - int vsel0_gpio; - int vsel1_gpio; int vsel0_def_state; int vsel1_def_state; }; -- cgit v1.2.3 From 4570ddda43387e5a130dd85e71a1947b0c11da77 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:07 +0100 Subject: powercap/drivers/dtpm: Encapsulate even more the code In order to increase the self-encapsulation of the dtpm generic code, the following changes add a power update op to the dtpm ops. That allows the generic code to call the dtpm backend function directly to update the power values. The power update function computes the power characteristics when it is invoked. In the case of the CPUs, the power consumption depends on the number of online CPUs. The online CPUs mask is not up to date at CPUHP_AP_ONLINE_DYN state in the tear down callback. That is the reason why the online and offline callbacks are at separate states.
As there is already an existing state for DTPM, it is simply moved to the DEAD state, so these changes add no new state. The dtpm node is not removed when the cpu is unplugged. That simplifies the code for the next changes and results in more self-encapsulated code. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-1-daniel.lezcano@linaro.org --- include/linux/cpuhotplug.h | 2 +- include/linux/dtpm.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 832d8a74fa59..ad9a34a80440 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -97,6 +97,7 @@ enum cpuhp_state { CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, + CPUHP_AP_DTPM_CPU_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, @@ -242,7 +243,6 @@ enum cpuhp_state { CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, - CPUHP_AP_DTPM_CPU_ONLINE, CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index e80a332e3d8a..acf8d3638988 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -29,6 +29,7 @@ struct dtpm_ops { u64 (*set_power_uw)(struct dtpm *, u64); u64 (*get_power_uw)(struct dtpm *); + int (*update_power_uw)(struct dtpm *); void (*release)(struct dtpm *); }; @@ -62,7 +63,7 @@ static inline struct dtpm *to_dtpm(struct powercap_zone *zone) return container_of(zone, struct dtpm, zone); } -int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max); +int dtpm_update_power(struct dtpm *dtpm); int dtpm_release_zone(struct powercap_zone *pcz); -- cgit v1.2.3 From 7a89d7eacf8e84f2afb94db5ae9d9f9faa93f01c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:09 +0100 Subject: powercap/drivers/dtpm: Simplify the dtpm table The dtpm table is an array of pointers, which forces the user of the table to define initdata along with the declaration of the table entry. It is more efficient to create an array of dtpm structures, so the declaration of the table entry can be done by initializing the different fields.
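With this layout, declaring a table entry reduces to something like the following sketch (the backend name and init stub are hypothetical):

	static int dtpm_foo_init(void)
	{
		/* allocate and register the backend's dtpm nodes here */
		return 0;
	}
	DTPM_DECLARE(dtpm_foo, dtpm_foo_init);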
Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-3-daniel.lezcano@linaro.org --- include/linux/dtpm.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index acf8d3638988..1e53db6bd5f9 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -33,25 +33,23 @@ struct dtpm_ops { void (*release)(struct dtpm *); }; -struct dtpm_descr; - -typedef int (*dtpm_init_t)(struct dtpm_descr *); +typedef int (*dtpm_init_t)(void); struct dtpm_descr { - struct dtpm *parent; - const char *name; dtpm_init_t init; }; /* Init section thermal table */ -extern struct dtpm_descr *__dtpm_table[]; -extern struct dtpm_descr *__dtpm_table_end[]; +extern struct dtpm_descr __dtpm_table[]; +extern struct dtpm_descr __dtpm_table_end[]; -#define DTPM_TABLE_ENTRY(name) \ - static typeof(name) *__dtpm_table_entry_##name \ - __used __section("__dtpm_table") = &name +#define DTPM_TABLE_ENTRY(name, __init) \ + static struct dtpm_descr __dtpm_table_entry_##name \ + __used __section("__dtpm_table") = { \ + .init = __init, \ + } -#define DTPM_DECLARE(name) DTPM_TABLE_ENTRY(name) +#define DTPM_DECLARE(name, init) DTPM_TABLE_ENTRY(name, init) #define for_each_dtpm_table(__dtpm) \ for (__dtpm = __dtpm_table; \ -- cgit v1.2.3 From d2cdc6adc30879d81160199fc7c6ab890fc4bd4c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 12 Mar 2021 14:04:10 +0100 Subject: powercap/drivers/dtpm: Use container_of instead of a private data field The dtpm framework provides an API to allocate a dtpm node. However when a backend dtpm driver needs to allocate a dtpm node it must define its own structure and store the pointer of this structure in the private field of the dtpm structure. It is more elegant to use the container_of macro and add the dtpm structure inside the dtpm backend specific structure. The code will be able to deal properly with the dtpm structure as a generic entity, making all this even more self-encapsulated. The dtpm_alloc() function no longer makes sense as the dtpm structure will be allocated when allocating the device specific dtpm structure. The dtpm_init() is provided instead. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210312130411.29833-4-daniel.lezcano@linaro.org --- include/linux/dtpm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index 1e53db6bd5f9..2890f6370eb9 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -23,7 +23,6 @@ struct dtpm { u64 power_max; u64 power_min; int weight; - void *private; }; struct dtpm_ops { @@ -65,7 +64,7 @@ int dtpm_update_power(struct dtpm *dtpm); int dtpm_release_zone(struct powercap_zone *pcz); -struct dtpm *dtpm_alloc(struct dtpm_ops *ops); +void dtpm_init(struct dtpm *dtpm, struct dtpm_ops *ops); void dtpm_unregister(struct dtpm *dtpm); -- cgit v1.2.3 From 3b13c168186c115501ee7d194460ba2f8c825155 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Oct 2021 14:30:51 +0100 Subject: percpu_ref: percpu_ref_tryget_live() version holding RCU Add percpu_ref_tryget_live_rcu(), which is a version of percpu_ref_tryget_live(), but the user is responsible for enclosing it in an RCU read lock section.
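A typical caller that already holds the RCU read lock would use it as in this sketch (the object lookup and ref member are hypothetical):

	rcu_read_lock();
	obj = lookup_object(key);
	if (obj && percpu_ref_tryget_live_rcu(&obj->ref)) {
		rcu_read_unlock();
		/* ... use obj ... */
		percpu_ref_put(&obj->ref);
	} else {
		rcu_read_unlock();
	}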
Signed-off-by: Pavel Begunkov Acked-by: Dennis Zhou Link: https://lore.kernel.org/r/3066500d7a6eb3e03f10adf98b87fdb3b1c49db8.1634822969.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/percpu-refcount.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index ae16a9856305..b31d3f3312ce 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -266,6 +266,28 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) return percpu_ref_tryget_many(ref, 1); } +/** + * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the + * caller is responsible for taking RCU. + * + * This function is safe to call as long as @ref is between init and exit. + */ +static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) +{ + unsigned long __percpu *percpu_count; + bool ret = false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (likely(__ref_is_percpu(ref, &percpu_count))) { + this_cpu_inc(*percpu_count); + ret = true; + } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { + ret = atomic_long_inc_not_zero(&ref->data->count); + } + return ret; +} + /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get @@ -283,20 +305,11 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned long __percpu *percpu_count; bool ret = false; rcu_read_lock(); - - if (__ref_is_percpu(ref, &percpu_count)) { - this_cpu_inc(*percpu_count); - ret = true; - } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { - ret = atomic_long_inc_not_zero(&ref->data->count); - } - + ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); - return ret; } -- cgit v1.2.3 From f059a1d2e23a165bf86e33673c6a7535a08c6341 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 22 Sep 2021 20:37:08 +0800 Subject: block: Add invalidate_disk() helper to invalidate the gendisk To hide internal implementation and simplify some driver code, this adds a helper to invalidate the gendisk. It will clean the gendisk's associated buffer/page caches and reset its internal states. Signed-off-by: Xie Yongji Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210922123711.187-2-xieyongji@bytedance.com Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c70bc5fce4db..13f313ab99e7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -213,6 +213,8 @@ static inline int add_disk(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); +void invalidate_disk(struct gendisk *disk); + void set_disk_ro(struct gendisk *disk, bool read_only); static inline int get_disk_ro(struct gendisk *disk) -- cgit v1.2.3 From 1e8d44bddf57f6d878e083f281a34d5c88feb7db Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Oct 2021 11:04:51 -0700 Subject: blk-crypto: rename keyslot-manager files to blk-crypto-profile In preparation for renaming struct blk_keyslot_manager to struct blk_crypto_profile, rename the keyslot-manager.h and keyslot-manager.c source files. Renaming these files separately before making a lot of changes to their contents makes it easier for git to understand that they were renamed. 
Acked-by: Ulf Hansson # For MMC Reviewed-by: Christoph Hellwig Reviewed-by: Mike Snitzer Reviewed-by: Martin K. Petersen Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20211018180453.40441-3-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 120 +++++++++++++++++++++++++++++++++++++ include/linux/keyslot-manager.h | 120 ------------------------------------- include/linux/mmc/host.h | 2 +- 3 files changed, 121 insertions(+), 121 deletions(-) create mode 100644 include/linux/blk-crypto-profile.h delete mode 100644 include/linux/keyslot-manager.h (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h new file mode 100644 index 000000000000..a27605e2f826 --- /dev/null +++ b/include/linux/blk-crypto-profile.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_KEYSLOT_MANAGER_H +#define __LINUX_KEYSLOT_MANAGER_H + +#include +#include + +struct blk_keyslot_manager; + +/** + * struct blk_ksm_ll_ops - functions to manage keyslots in hardware + * @keyslot_program: Program the specified key into the specified slot in the + * inline encryption hardware. + * @keyslot_evict: Evict key from the specified keyslot in the hardware. + * The key is provided so that e.g. dm layers can evict + * keys from the devices that they map over. + * Returns 0 on success, -errno otherwise. + * + * This structure should be provided by storage device drivers when they set up + * a keyslot manager - this structure holds the function ptrs that the keyslot + * manager will use to manipulate keyslots in the hardware. + */ +struct blk_ksm_ll_ops { + int (*keyslot_program)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); + int (*keyslot_evict)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); +}; + +struct blk_keyslot_manager { + /* + * The struct blk_ksm_ll_ops that this keyslot manager will use + * to perform operations like programming and evicting keys on the + * device + */ + struct blk_ksm_ll_ops ksm_ll_ops; + + /* + * The maximum number of bytes supported for specifying the data unit + * number. + */ + unsigned int max_dun_bytes_supported; + + /* + * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents + * whether a crypto mode and data unit size are supported. The i'th + * bit of crypto_mode_supported[crypto_mode] is set iff a data unit + * size of (1 << i) is supported. We only support data unit sizes + * that are powers of 2. + */ + unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; + + /* Device for runtime power management (NULL if none) */ + struct device *dev; + + /* Here onwards are *private* fields for internal keyslot manager use */ + + unsigned int num_slots; + + /* Protects programming and evicting keys from the device */ + struct rw_semaphore lock; + + /* List of idle slots, with least recently used slot at front */ + wait_queue_head_t idle_slots_wait_queue; + struct list_head idle_slots; + spinlock_t idle_slots_lock; + + /* + * Hash table which maps struct *blk_crypto_key to keyslots, so that we + * can find a key's keyslot in O(1) time rather than O(num_slots). + * Protected by 'lock'. 
+ */ + struct hlist_head *slot_hashtable; + unsigned int log_slot_ht_size; + + /* Per-keyslot data */ + struct blk_ksm_keyslot *slots; +}; + +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); + +int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, + unsigned int num_slots); + +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr); + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); + +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); + +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg); + +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key); + +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm); + +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child); + +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); + +bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, + struct blk_keyslot_manager *ksm_subset); + +void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, + struct blk_keyslot_manager *reference_ksm); + +#endif /* __LINUX_KEYSLOT_MANAGER_H */ diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h deleted file mode 100644 index a27605e2f826..000000000000 --- a/include/linux/keyslot-manager.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright 2019 Google LLC - */ - -#ifndef __LINUX_KEYSLOT_MANAGER_H -#define __LINUX_KEYSLOT_MANAGER_H - -#include -#include - -struct blk_keyslot_manager; - -/** - * struct blk_ksm_ll_ops - functions to manage keyslots in hardware - * @keyslot_program: Program the specified key into the specified slot in the - * inline encryption hardware. - * @keyslot_evict: Evict key from the specified keyslot in the hardware. - * The key is provided so that e.g. dm layers can evict - * keys from the devices that they map over. - * Returns 0 on success, -errno otherwise. - * - * This structure should be provided by storage device drivers when they set up - * a keyslot manager - this structure holds the function ptrs that the keyslot - * manager will use to manipulate keyslots in the hardware. - */ -struct blk_ksm_ll_ops { - int (*keyslot_program)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); - int (*keyslot_evict)(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot); -}; - -struct blk_keyslot_manager { - /* - * The struct blk_ksm_ll_ops that this keyslot manager will use - * to perform operations like programming and evicting keys on the - * device - */ - struct blk_ksm_ll_ops ksm_ll_ops; - - /* - * The maximum number of bytes supported for specifying the data unit - * number. - */ - unsigned int max_dun_bytes_supported; - - /* - * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents - * whether a crypto mode and data unit size are supported. The i'th - * bit of crypto_mode_supported[crypto_mode] is set iff a data unit - * size of (1 << i) is supported. We only support data unit sizes - * that are powers of 2. 
- */ - unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; - - /* Device for runtime power management (NULL if none) */ - struct device *dev; - - /* Here onwards are *private* fields for internal keyslot manager use */ - - unsigned int num_slots; - - /* Protects programming and evicting keys from the device */ - struct rw_semaphore lock; - - /* List of idle slots, with least recently used slot at front */ - wait_queue_head_t idle_slots_wait_queue; - struct list_head idle_slots; - spinlock_t idle_slots_lock; - - /* - * Hash table which maps struct *blk_crypto_key to keyslots, so that we - * can find a key's keyslot in O(1) time rather than O(num_slots). - * Protected by 'lock'. - */ - struct hlist_head *slot_hashtable; - unsigned int log_slot_ht_size; - - /* Per-keyslot data */ - struct blk_ksm_keyslot *slots; -}; - -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); - -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots); - -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr); - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); - -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); - -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg); - -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key); - -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm); - -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child); - -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); - -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset); - -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm); - -#endif /* __LINUX_KEYSLOT_MANAGER_H */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0c0c9a0fdf57..725b1de41767 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include struct mmc_ios { unsigned int clock; /* clock rate */ -- cgit v1.2.3 From cb77cb5abe1f4fae4a33b735606aae22f9eaa1c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Oct 2021 11:04:52 -0700 Subject: blk-crypto: rename blk_keyslot_manager to blk_crypto_profile blk_keyslot_manager is misnamed because it doesn't necessarily manage keyslots. It actually does several different things: - Contains the crypto capabilities of the device. - Provides functions to control the inline encryption hardware. Originally these were just for programming/evicting keyslots; however, new functionality (hardware-wrapped keys) will require new functions here which are unrelated to keyslots. Moreover, device-mapper devices already (ab)use "keyslot_evict" to pass key eviction requests to their underlying devices even though device-mapper devices don't have any keyslots themselves (so it really should be "evict_key", not "keyslot_evict"). - Sometimes (but not always!) it manages keyslots. Originally it always did, but device-mapper devices don't have keyslots themselves, so they use a "passthrough keyslot manager" which doesn't actually manage keyslots. This hack works, but the terminology is unnatural. 
Also, some hardware doesn't have keyslots and thus also uses a "passthrough keyslot manager" (support for such hardware is yet to be upstreamed, but it will happen eventually). Let's stop having keyslot managers which don't actually manage keyslots. Instead, rename blk_keyslot_manager to blk_crypto_profile. This is a fairly big change, since for consistency it also has to update keyslot manager-related function names, variable names, and comments -- not just the actual struct name. However it's still a fairly straightforward change, as it doesn't change any actual functionality. Acked-by: Ulf Hansson # For MMC Reviewed-by: Mike Snitzer Reviewed-by: Martin K. Petersen Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20211018180453.40441-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 164 ++++++++++++++++++++++++------------- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 16 ++-- include/linux/device-mapper.h | 4 +- include/linux/mmc/host.h | 2 +- 5 files changed, 117 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index a27605e2f826..bbab65bd5428 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -3,67 +3,113 @@ * Copyright 2019 Google LLC */ -#ifndef __LINUX_KEYSLOT_MANAGER_H -#define __LINUX_KEYSLOT_MANAGER_H +#ifndef __LINUX_BLK_CRYPTO_PROFILE_H +#define __LINUX_BLK_CRYPTO_PROFILE_H #include #include -struct blk_keyslot_manager; +struct blk_crypto_profile; /** - * struct blk_ksm_ll_ops - functions to manage keyslots in hardware - * @keyslot_program: Program the specified key into the specified slot in the - * inline encryption hardware. - * @keyslot_evict: Evict key from the specified keyslot in the hardware. - * The key is provided so that e.g. dm layers can evict - * keys from the devices that they map over. - * Returns 0 on success, -errno otherwise. + * struct blk_crypto_ll_ops - functions to control inline encryption hardware * - * This structure should be provided by storage device drivers when they set up - * a keyslot manager - this structure holds the function ptrs that the keyslot - * manager will use to manipulate keyslots in the hardware. + * Low-level operations for controlling inline encryption hardware. This + * interface must be implemented by storage drivers that support inline + * encryption. All functions may sleep, are serialized by profile->lock, and + * are never called while profile->dev (if set) is runtime-suspended. */ -struct blk_ksm_ll_ops { - int (*keyslot_program)(struct blk_keyslot_manager *ksm, +struct blk_crypto_ll_ops { + + /** + * @keyslot_program: Program a key into the inline encryption hardware. + * + * Program @key into the specified @slot in the inline encryption + * hardware, overwriting any key that the keyslot may already contain. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * This is required if the device has keyslots. Otherwise (i.e. if the + * device is a layered device, or if the device is real hardware that + * simply doesn't have the concept of keyslots) it is never called. + * + * Must return 0 on success, or -errno on failure. + */ + int (*keyslot_program)(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot); - int (*keyslot_evict)(struct blk_keyslot_manager *ksm, + + /** + * @keyslot_evict: Evict a key from the inline encryption hardware. 
+ * + * If the device has keyslots, this function must evict the key from the + * specified @slot. The slot will contain @key, but there should be no + * need for the @key argument to be used as @slot should be sufficient. + * The keyslot is guaranteed to not be in-use by any I/O. + * + * If the device doesn't have keyslots itself, this function must evict + * @key from any underlying devices. @slot won't be valid in this case. + * + * If there are no keyslots and no underlying devices, this function + * isn't required. + * + * Must return 0 on success, or -errno on failure. + */ + int (*keyslot_evict)(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, unsigned int slot); }; -struct blk_keyslot_manager { - /* - * The struct blk_ksm_ll_ops that this keyslot manager will use - * to perform operations like programming and evicting keys on the - * device +/** + * struct blk_crypto_profile - inline encryption profile for a device + * + * This struct contains a storage device's inline encryption capabilities (e.g. + * the supported crypto algorithms), driver-provided functions to control the + * inline encryption hardware (e.g. programming and evicting keys), and optional + * device-independent keyslot management data. + */ +struct blk_crypto_profile { + + /* public: Drivers must initialize the following fields. */ + + /** + * @ll_ops: Driver-provided functions to control the inline encryption + * hardware, e.g. program and evict keys. */ - struct blk_ksm_ll_ops ksm_ll_ops; + struct blk_crypto_ll_ops ll_ops; - /* - * The maximum number of bytes supported for specifying the data unit - * number. + /** + * @max_dun_bytes_supported: The maximum number of bytes supported for + * specifying the data unit number (DUN). Specifically, the range of + * supported DUNs is 0 through (1 << (8 * max_dun_bytes_supported)) - 1. */ unsigned int max_dun_bytes_supported; - /* - * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents - * whether a crypto mode and data unit size are supported. The i'th - * bit of crypto_mode_supported[crypto_mode] is set iff a data unit - * size of (1 << i) is supported. We only support data unit sizes - * that are powers of 2. + /** + * @modes_supported: Array of bitmasks that specifies whether each + * combination of crypto mode and data unit size is supported. + * Specifically, the i'th bit of modes_supported[crypto_mode] is set if + * crypto_mode can be used with a data unit size of (1 << i). Note that + * only data unit sizes that are powers of 2 can be supported. */ - unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; + unsigned int modes_supported[BLK_ENCRYPTION_MODE_MAX]; - /* Device for runtime power management (NULL if none) */ + /** + * @dev: An optional device for runtime power management. If the driver + * provides this device, it will be runtime-resumed before any function + * in @ll_ops is called and will remain resumed during the call. + */ struct device *dev; - /* Here onwards are *private* fields for internal keyslot manager use */ + /* private: The following fields shouldn't be accessed by drivers. */ + /* Number of keyslots, or 0 if not applicable */ unsigned int num_slots; - /* Protects programming and evicting keys from the device */ + /* + * Serializes all calls to functions in @ll_ops as well as all changes + * to @slot_hashtable. This can also be taken in read mode to look up + * keyslots while ensuring that they can't be changed concurrently. 
+ */ struct rw_semaphore lock; /* List of idle slots, with least recently used slot at front */ @@ -80,41 +126,41 @@ struct blk_keyslot_manager { unsigned int log_slot_ht_size; /* Per-keyslot data */ - struct blk_ksm_keyslot *slots; + struct blk_crypto_keyslot *slots; }; -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); - -int devm_blk_ksm_init(struct device *dev, struct blk_keyslot_manager *ksm, - unsigned int num_slots); +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots); -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr); +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots); -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg); +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key); +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); -void blk_ksm_destroy(struct blk_keyslot_manager *ksm); +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); -void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, - const struct blk_keyslot_manager *child); +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); -void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child); -bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, - struct blk_keyslot_manager *ksm_subset); +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference); -void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, - struct blk_keyslot_manager *reference_ksm); +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src); -#endif /* __LINUX_KEYSLOT_MANAGER_H */ +#endif /* __LINUX_BLK_CRYPTO_PROFILE_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e13780236550..b4039fdf1b04 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -133,7 +133,7 @@ struct request { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; - struct blk_ksm_keyslot *crypt_keyslot; + struct blk_crypto_keyslot *crypt_keyslot; #endif unsigned short write_hint; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7b1e9355123..f72ccb2829db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,7 @@ struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; -struct blk_keyslot_manager; +struct blk_crypto_profile; /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -224,8 +224,7 @@ struct 
request_queue { unsigned int dma_alignment; #ifdef CONFIG_BLK_INLINE_ENCRYPTION - /* Inline crypto capabilities */ - struct blk_keyslot_manager *ksm; + struct blk_crypto_profile *crypto_profile; #endif unsigned int rq_timeout; @@ -1142,19 +1141,20 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned lo #ifdef CONFIG_BLK_INLINE_ENCRYPTION -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q); -void blk_ksm_unregister(struct request_queue *q); +void blk_crypto_unregister(struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, - struct request_queue *q) +static inline bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) { return true; } -static inline void blk_ksm_unregister(struct request_queue *q) { } +static inline void blk_crypto_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 114553b487ef..a7df155ea49b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -576,9 +576,9 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *t); /* - * Table keyslot manager functions + * Table blk_crypto_profile functions */ -void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); +void dm_destroy_crypto_profile(struct blk_crypto_profile *profile); /*----------------------------------------------------------------- * Macros. diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 725b1de41767..52eae8c45b8d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -492,7 +492,7 @@ struct mmc_host { /* Inline encryption support */ #ifdef CONFIG_MMC_CRYPTO - struct blk_keyslot_manager ksm; + struct blk_crypto_profile crypto_profile; #endif /* Host Software Queue support */ -- cgit v1.2.3 From f64dd4627ec6edc39bf1430fe6dbc923d2300a88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:34 +0200 Subject: ftrace: Add multi direct register/unregister interface Adding an interface to register multiple direct functions within a single call. Adding the following functions: register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) register_ftrace_direct_multi() registers the direct function (addr) with all functions in the ops filter. The ops filter can be updated beforehand with ftrace_set_filter_ip() calls. None of the requested functions may have a direct function currently registered, otherwise register_ftrace_direct_multi() will fail. unregister_ftrace_direct_multi() unregisters the ops' related direct functions.
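Usage might look like the following sketch (my_tramp is a hypothetical direct trampoline; error handling elided):

	static struct ftrace_ops direct_ops;

	/* Build the filter first, then attach one trampoline to all entries */
	ftrace_set_filter_ip(&direct_ops, (unsigned long)func_a, 0, 0);
	ftrace_set_filter_ip(&direct_ops, (unsigned long)func_b, 0, 0);
	register_ftrace_direct_multi(&direct_ops, (unsigned long)my_tramp);
	/* ... later ... */
	unregister_ftrace_direct_multi(&direct_ops, (unsigned long)my_tramp);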
Link: https://lkml.kernel.org/r/20211008091336.33616-7-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 11 +++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 16a7baaba702..0158261cac9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -324,7 +324,10 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long old_addr, unsigned long new_addr); unsigned long ftrace_find_rec_direct(unsigned long ip); +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); #else +struct ftrace_ops; # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -354,6 +357,14 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip) { return 0; } +static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} +static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -- cgit v1.2.3 From ccf5a89efd6f0a9483cea8acd4a0822b1a47e59a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:35 +0200 Subject: ftrace: Add multi direct modify interface Adding an interface to modify the registered direct function for an ftrace_ops. Adding the following function: modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The function changes the currently registered direct function for all attached functions. Link: https://lkml.kernel.org/r/20211008091336.33616-8-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0158261cac9f..9999e29187de 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,6 +326,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); + #else struct ftrace_ops; # define ftrace_direct_func_count 0 @@ -365,6 +367,10 @@ static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigne { return -ENODEV; } +static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -- cgit v1.2.3 From bce5c81cb31f7f124ce231ec79df9e85a8bac132 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 19 Oct 2021 09:25:20 -0400 Subject: tracing: Explain the trace recursion transition bit better The current text of the explanation of the transition bit in the trace recursion protection is not very clear. Improve the text, so that when all the archs no longer have the issue of tracing between a start of a new (interrupt) context and updating the preempt_count to reflect the new context, it may be removed.
Link: https://lore.kernel.org/all/20211018220203.064a42ed@gandalf.local.home/ Suggested-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 1d8cce02c3fb..24f284eb55a7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -168,8 +168,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. + * If an interrupt occurs during a trace, and another trace + * happens in that interrupt but before the preempt_count is + * updated to reflect the new interrupt context, then this + * will think a recursion occurred, and the event will be dropped. + * Let a single instance happen via the TRANSITION_BIT to + * not drop those events. */ bit = TRACE_TRANSITION_BIT; if (val & (1 << bit)) { -- cgit v1.2.3 From 17b5b576ff5faff99a4c8140d521cd4d7fff5c16 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 7 Oct 2021 15:46:39 +0200 Subject: mux: add support for delay after muxing Hardware may require some time for the muxed analog signals to settle after the muxing is changed. Allow users of the mux subsystem to specify this delay with the new mux_control_select_delay() function (and the _try equivalent). Signed-off-by: Vincent Whitchurch Reviewed-by: Lars-Peter Clausen Tested-by: Lars-Peter Clausen Acked-by: Peter Rosin Link: https://lore.kernel.org/r/20211007134641.13417-2-vincent.whitchurch@axis.com Signed-off-by: Jonathan Cameron --- include/linux/mux/consumer.h | 23 +++++++++++++++++++---- include/linux/mux/driver.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h index 5fc6bb2fefad..7a09b040ac39 100644 --- a/include/linux/mux/consumer.h +++ b/include/linux/mux/consumer.h @@ -16,10 +16,25 @@ struct device; struct mux_control; unsigned int mux_control_states(struct mux_control *mux); -int __must_check mux_control_select(struct mux_control *mux, - unsigned int state); -int __must_check mux_control_try_select(struct mux_control *mux, - unsigned int state); +int __must_check mux_control_select_delay(struct mux_control *mux, + unsigned int state, + unsigned int delay_us); +int __must_check mux_control_try_select_delay(struct mux_control *mux, + unsigned int state, + unsigned int delay_us); + +static inline int __must_check mux_control_select(struct mux_control *mux, + unsigned int state) +{ + return mux_control_select_delay(mux, state, 0); +} + +static inline int __must_check mux_control_try_select(struct mux_control *mux, + unsigned int state) +{ + return mux_control_try_select_delay(mux, state, 0); +} + int mux_control_deselect(struct mux_control *mux); struct mux_control *mux_control_get(struct device *dev, const char *mux_name); diff --git a/include/linux/mux/driver.h b/include/linux/mux/driver.h index 627a2c6bc02d..18824064f8c0 100644 --- a/include/linux/mux/driver.h +++ b/include/linux/mux/driver.h @@ -12,6 +12,7 @@ #include #include +#include #include struct mux_chip; @@ -33,6 +34,7 @@ struct mux_control_ops { * @states: The number of mux controller states. 
* @idle_state: The mux controller state to use when inactive, or one * of MUX_IDLE_AS_IS and MUX_IDLE_DISCONNECT. + * @last_change: Timestamp of last change * * Mux drivers may only change @states and @idle_state, and may only do so * between allocation and registration of the mux controller. Specifically, @@ -47,6 +49,8 @@ struct mux_control { unsigned int states; int idle_state; + + ktime_t last_change; }; /** -- cgit v1.2.3 From 9eeb3aa33ae005526f672b394c1791578463513f Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 21 Oct 2021 21:47:51 +0800 Subject: bpf: Add bpf_skc_to_unix_sock() helper The helper is used in tracing programs to cast a socket pointer to a unix_sock pointer. The return value could be NULL if the casting is illegal. Suggested-by: Yonghong Song Signed-off-by: Hengqi Chen Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211021134752.1223426-2-hengqi.chen@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d604c8251d88..be3102b4554b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2093,6 +2093,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_snprintf_proto; -- cgit v1.2.3 From d08fd747d0ed682b6c6c280ba454cafcad33563d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 14 Oct 2021 16:35:11 +0200 Subject: Compiler Attributes: remove GCC 5.1 mention GCC 5.1 is now the minimum version. Acked-by: Nick Desaulniers Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index e6ec63403965..87d1e773400c 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -104,7 +104,6 @@ #define __deprecated /* - * Optional: only supported since gcc >= 5.1 * Optional: not supported by clang * Optional: not supported by icc * -- cgit v1.2.3 From aba64c7da98330141dcdadd5612f088043a83696 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Wed, 20 Oct 2021 00:48:17 -0700 Subject: bpf: Add verified_insns to bpf_prog_info and fdinfo This stat is currently printed in the verifier log and not stored anywhere. To ease consumption of this data, add a field to bpf_prog_aux so it can be exposed via BPF_OBJ_GET_INFO_BY_FD and fdinfo. 
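From userspace the new counter can then be read back roughly like this (a sketch using libbpf's bpf_obj_get_info_by_fd(); the matching verified_insns member of struct bpf_prog_info is added on the uapi side of this series):

    struct bpf_prog_info info = {};
    __u32 info_len = sizeof(info);

    if (!bpf_obj_get_info_by_fd(prog_fd, &info, &info_len))
        printf("prog verified %u insns\n", info.verified_insns);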
Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211020074818.1017682-2-davemarchevsky@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be3102b4554b..31421c74ba08 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -887,6 +887,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ + u32 verified_insns; struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; #ifdef CONFIG_SECURITY -- cgit v1.2.3 From 7c00621dcaeea206d7489b3e8b50b1864841ae69 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 14 Oct 2021 15:23:31 +0200 Subject: compiler_types: mark __compiletime_assert failure as __noreturn `__compiletime_assert` declares a fake `extern` function which appears (to the compiler) to be called when the test fails. Therefore, compilers may emit possibly-uninitialized warnings in some cases, even if it will be an error anyway (for compilers supporting the `error` attribute, e.g. GCC and Clang >= 14) or a link failure (for those that do not, e.g. Clang < 14). Annotating the fake function as `__noreturn` gives them the information they need to avoid the warning, e.g. see https://godbolt.org/z/x1v69jjYY. Link: https://lore.kernel.org/llvm/202110100514.3h9CI4s0-lkp@intel.com/ Reported-by: kernel test robot Reviewed-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Miguel Ojeda --- include/linux/compiler_types.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b6ff83a714ca..ca1a66b8cd2f 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -298,7 +298,13 @@ struct ftrace_likely_data { #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ - extern void prefix ## suffix(void) __compiletime_error(msg); \ + /* \ + * __noreturn is needed to give the compiler enough \ + * information to avoid certain possibly-uninitialized \ + * warnings (regardless of the build failing). \ + */ \ + __noreturn extern void prefix ## suffix(void) \ + __compiletime_error(msg); \ if (!(condition)) \ prefix ## suffix(); \ } while (0) -- cgit v1.2.3 From 008d3825a805557464c5e75f9eb806a3aa2f5e6d Mon Sep 17 00:00:00 2001 From: Eddie James Date: Tue, 19 Oct 2021 15:53:04 -0500 Subject: fsi: occ: Use a large buffer for responses Allocate a large buffer for each OCC to handle response data. This removes memory allocation during an operation, and also allows for the maximum amount of SBE FFDC. Previously for the putsram and attn commands, only 32 words would have been available, and for getsram, only up to the size of the transfer. SBE FFDC might be up to 8Kb. The SBE interface expects data to be specified in units of words (4 bytes), defined as OCC_MAX_RESP_WORDS. This change allows the full FFDC capture to be implemented, where before it was not available. 
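For scale, a response buffer sized for the worst case looks roughly like this (illustrative sketch only, not the exact driver code):

    /* Room for the largest response the SBE may return, incl. FFDC. */
    resp = kvzalloc(OCC_MAX_RESP_WORDS * 4, GFP_KERNEL);
    if (!resp)
        return -ENOMEM;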
Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20211019205307.36946-2-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- include/linux/fsi-occ.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsi-occ.h b/include/linux/fsi-occ.h index d4cdc2aa6e33..7ee3dbd7f4b3 100644 --- a/include/linux/fsi-occ.h +++ b/include/linux/fsi-occ.h @@ -19,6 +19,8 @@ struct device; #define OCC_RESP_CRIT_OCB 0xE3 #define OCC_RESP_CRIT_HW 0xE4 +#define OCC_MAX_RESP_WORDS 2048 + int fsi_occ_submit(struct device *dev, const void *request, size_t req_len, void *response, size_t *resp_len); -- cgit v1.2.3 From dc0fd0acb6e0e8025a0a43ada54513b216254fac Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Thu, 21 Oct 2021 15:09:03 +0200 Subject: HID: surface-hid: Use correct event registry for managing HID events Until now, we have only ever seen the REG-category registry being used on devices addressed with target ID 2. In fact, we have only ever seen Surface Aggregator Module (SAM) HID devices with target ID 2. For those devices, the registry also has to be addressed with target ID 2. Some devices, like the new Surface Laptop Studio, however, address their HID devices on target ID 1. As a result of this, any target ID 2 commands time out. This includes event management commands addressed to the target ID 2 REG-category registry. For these devices, the registry has to be addressed via target ID 1 instead. We therefore assume that the target ID of the registry to be used depends on the target ID of the respective device. Implement this accordingly. Note that we currently allow the surface HID driver to only load against devices with target ID 2, so these timeouts are not happening (yet). This is just a preparation step before we allow the driver to load against all target IDs. Cc: stable@vger.kernel.org # 5.14+ Signed-off-by: Maximilian Luz Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211021130904.862610-3-luzmaximilian@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/surface_aggregator/controller.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/surface_aggregator/controller.h b/include/linux/surface_aggregator/controller.h index 068e1982ad37..74bfdffaf7b0 100644 --- a/include/linux/surface_aggregator/controller.h +++ b/include/linux/surface_aggregator/controller.h @@ -792,8 +792,8 @@ enum ssam_event_mask { #define SSAM_EVENT_REGISTRY_KIP \ SSAM_EVENT_REGISTRY(SSAM_SSH_TC_KIP, 0x02, 0x27, 0x28) -#define SSAM_EVENT_REGISTRY_REG \ - SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, 0x02, 0x01, 0x02) +#define SSAM_EVENT_REGISTRY_REG(tid)\ + SSAM_EVENT_REGISTRY(SSAM_SSH_TC_REG, tid, 0x01, 0x02) /** * enum ssam_event_notifier_flags - Flags for event notifiers. -- cgit v1.2.3 From dd66f56caea6bb1a3703fb3bfc3106444d05a930 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 21 Oct 2021 08:55:24 +0200 Subject: dma-buf: fix kerneldoc for renamed members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those members were renamed; update the kerneldoc as well.
Signed-off-by: Christian König Reviewed-by: Alex Deucher Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20211021141945.84023-1-christian.koenig@amd.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 02c2eb874da6..9807aef33685 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -433,8 +433,8 @@ struct dma_buf { /** @poll: for userspace poll support */ wait_queue_head_t poll; - /** @cb_excl: for userspace poll support */ - /** @cb_shared: for userspace poll support */ + /** @cb_in: for userspace poll support */ + /** @cb_out: for userspace poll support */ struct dma_buf_poll_cb_t { struct dma_fence_cb cb; wait_queue_head_t *poll; -- cgit v1.2.3 From 48d09e97876bed4bcc503d528bdba8c907e43cb3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 21 Oct 2021 08:58:34 -0700 Subject: firmware_loader: formalize built-in firmware API Formalize the built-in firmware with a proper API. This can later be used by other callers where all they need is built-in firmware. We export the firmware_request_builtin() call for now only under the TEST_FIRMWARE symbol namespace as there are no direct modular users for it. If they pop up they are free to export it generally. Built-in code always gets access to the callers and we'll demonstrate a hidden user which has been lurking in the kernel for a while and the reason why using a proper API was better long term. Reviewed-by: Borislav Petkov Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20211021155843.1969401-2-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 25109192cebe..d743a8d1c2fe 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -20,12 +20,19 @@ struct firmware { struct module; struct device; +/* + * Built-in firmware functionality is only available if FW_LOADER=y, but not + * FW_LOADER=m + */ +#ifdef CONFIG_FW_LOADER struct builtin_fw { char *name; void *data; unsigned long size; }; +bool firmware_request_builtin(struct firmware *fw, const char *name); + /* We have to play tricks here much like stringify() to get the __COUNTER__ macro to be expanded as we want it */ #define __fw_concat1(x, y) x##y @@ -38,6 +45,14 @@ struct builtin_fw { static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ __used __section(".builtin_fw") = { name, blob, size } +#else +static inline bool firmware_request_builtin(struct firmware *fw, + const char *name) +{ + return false; +} +#endif + #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, struct device *device); -- cgit v1.2.3 From e520ecf4546fdaa7169ba75a35d24e2c53403a6e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 21 Oct 2021 08:58:35 -0700 Subject: firmware_loader: remove old DECLARE_BUILTIN_FIRMWARE() This was never used upstream. Time to get rid of it. We don't carry around unused baggage. 
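Callers that want built-in firmware now go through firmware_request_builtin() instead, roughly like this (a sketch; "vendor/blob.bin" and consume_blob() are placeholders, not real names):

    struct firmware fw;

    if (firmware_request_builtin(&fw, "vendor/blob.bin")) {
        /* fw.data and fw.size describe the built-in blob. */
        consume_blob(fw.data, fw.size);
    }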
Reviewed-by: Borislav Petkov Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20211021155843.1969401-3-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index d743a8d1c2fe..34e8d5844fa0 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -32,19 +32,6 @@ struct builtin_fw { }; bool firmware_request_builtin(struct firmware *fw, const char *name); - -/* We have to play tricks here much like stringify() to get the - __COUNTER__ macro to be expanded as we want it */ -#define __fw_concat1(x, y) x##y -#define __fw_concat(x, y) __fw_concat1(x, y) - -#define DECLARE_BUILTIN_FIRMWARE(name, blob) \ - DECLARE_BUILTIN_FIRMWARE_SIZE(name, &(blob), sizeof(blob)) - -#define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size) \ - static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ - __used __section(".builtin_fw") = { name, blob, size } - #else static inline bool firmware_request_builtin(struct firmware *fw, const char *name) -- cgit v1.2.3 From e2e2c0f20f321b0ec36e8bde467259c0adf1fecb Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 21 Oct 2021 08:58:37 -0700 Subject: firmware_loader: move struct builtin_fw to the only place used Now that x86 doesn't abuse picking at internals to the firmware loader, move the built-in firmware struct out to its only user. Reviewed-by: Borislav Petkov Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20211021155843.1969401-5-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 34e8d5844fa0..3b057dfc8284 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -25,12 +25,6 @@ struct device; * FW_LOADER=m */ #ifdef CONFIG_FW_LOADER -struct builtin_fw { - char *name; - void *data; - unsigned long size; -}; - bool firmware_request_builtin(struct firmware *fw, const char *name); #else static inline bool firmware_request_builtin(struct firmware *fw, const char *name) -- cgit v1.2.3 From 9208d414975895f69e9aca49153060ddd31b18d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 08:06:01 +0200 Subject: block: add a ->get_unique_id method Add a method to query unique IDs from block devices. It will be used to remove code that deeply pokes into SCSI internals in the NFS server. The implementation in the sd driver itself is also much nicer as it can use the cached VPD page instead of always sending a command as the current NFS code does. For now the interface is kept very minimal, but it could easily be extended when other users, like a block-layer sysfs interface for unique IDs, show up.
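A block driver wires this up roughly as follows (a sketch with a hypothetical foo driver; the real sd implementation reads the cached VPD page instead):

    static int foo_get_unique_id(struct gendisk *disk, u8 id[16],
                                 enum blk_unique_id type)
    {
        struct foo_device *foo = disk->private_data;

        if (type != BLK_UID_EUI64)
            return -EINVAL;
        memcpy(id, foo->eui64, 8);
        return 8;       /* length of the identifier */
    }

    static const struct block_device_operations foo_fops = {
        .get_unique_id  = foo_get_unique_id,
    };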
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20211021060607.264371-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..0d5826066e16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1158,6 +1158,14 @@ static inline void blk_crypto_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +enum blk_unique_id { + /* these match the Designator Types specified in SPC */ + BLK_UID_T10 = 1, + BLK_UID_EUI64 = 2, + BLK_UID_NAA = 3, +}; + +#define NFL4_UFLG_MASK 0x0000003F struct block_device_operations { void (*submit_bio)(struct bio *bio); @@ -1176,6 +1184,9 @@ struct block_device_operations { int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); + /* returns the length of the identifier or a negative errno: */ + int (*get_unique_id)(struct gendisk *disk, u8 id[16], + enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; -- cgit v1.2.3 From 4abafdc4360d993104c2b2f85943938a0c6ad025 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 08:06:06 +0200 Subject: block: remove the initialize_rq_fn blk_mq_ops method Entirely unused now. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20211021060607.264371-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b4039fdf1b04..ebc45cf0450b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -566,11 +566,6 @@ struct blk_mq_ops { void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); - /** - * @initialize_rq_fn: Called from inside blk_get_request(). - */ - void (*initialize_rq_fn)(struct request *rq); - /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. -- cgit v1.2.3 From 4845012eb5b4e56cadb5f484cb55dd4fd9d1df80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 08:06:07 +0200 Subject: block: remove QUEUE_FLAG_SCSI_PASSTHROUGH Export scsi_device_from_queue for use with pktcdvd and use that instead of the otherwise unused QUEUE_FLAG_SCSI_PASSTHROUGH queue flag. 
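In pktcdvd-style callers the check then becomes roughly (sketch):

    struct scsi_device *sdev = scsi_device_from_queue(q);

    if (!sdev)
        return -ENODEV; /* not a SCSI command queue */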
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20211021060607.264371-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0d5826066e16..1ad30f85d30e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -357,7 +357,6 @@ struct request_queue { #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ @@ -391,8 +390,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -#define blk_queue_scsi_passthrough(q) \ - test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME -- cgit v1.2.3 From 70164eb6ccb76ab679b016b4b60123bf4ec6c162 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:25 +0200 Subject: block: remove __sync_blockdev Instead offer a new sync_blockdev_nowait helper for the !wait case. This new helper is exported as it will grow modular callers in a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..4b5a6bbdacd0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1266,6 +1266,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1274,6 +1275,10 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:30 +0200 Subject: block: simplify the block device syncing code Get rid of the indirections and just provide a sync_bdevs helper for the generic sync code. 
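The generic sync code can then do a two-pass flush, roughly (a sketch of the intended call pattern):

    sync_bdevs(false);  /* start writeback, don't wait */
    /* ... sync filesystems in between ... */
    sync_bdevs(true);   /* second pass, wait for completion */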
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4b5a6bbdacd0..09fdf7018b7f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1267,6 +1267,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1279,6 +1280,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From fadb7ff1a6c2c565af56b4aacdd086b067eed440 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 14 Oct 2021 15:25:53 +0100 Subject: bpf: Prevent increasing bpf_jit_limit above max Restrict bpf_jit_limit to the maximum supported by the arch's JIT. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211014142554.53120-4-lmb@cloudflare.com --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4a93c12543ee..ef03ff34234d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1051,6 +1051,7 @@ extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; +extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); -- cgit v1.2.3 From 599593a82fc57f5e9453c8ef7420df3206934a0c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 Oct 2021 19:35:45 -0600 Subject: sched: make task_struct->plug always defined MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_BLOCK isn't set, then it's an empty struct anyway. 
Just make it generally available, so we don't break the compile: kernel/sched/core.c: In function ‘sched_submit_work’: kernel/sched/core.c:6346:35: error: ‘struct task_struct’ has no member named ‘plug’ 6346 | blk_flush_plug(tsk->plug, true); | ^~ kernel/sched/core.c: In function ‘io_schedule_prepare’: kernel/sched/core.c:8357:20: error: ‘struct task_struct’ has no member named ‘plug’ 8357 | if (current->plug) | ^~ kernel/sched/core.c:8358:39: error: ‘struct task_struct’ has no member named ‘plug’ 8358 | blk_flush_plug(current->plug, true); | ^~ Reported-by: Nathan Chancellor Fixes: 008f75a20e70 ("block: cleanup the flush plug helpers") Signed-off-by: Jens Axboe --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a927ddec64..e0454e60fe8f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1160,10 +1160,8 @@ struct task_struct { /* Stacked block device info: */ struct bio_list *bio_list; -#ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; -#endif /* VM state: */ struct reclaim_state *reclaim_state; -- cgit v1.2.3 From 0ebecb2644c834d8a69f07c5d44a130835950171 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 22 Oct 2021 11:59:12 -0400 Subject: net: mdio: Add helper functions for accessing MDIO devices This adds some helpers for accessing non-phy MDIO devices. They are analogous to phy_(read|write|modify), except that they take an mdio_device and not a phy_device. Signed-off-by: Sean Anderson Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/mdio.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index f622888a4ba8..9f3587a61e14 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -352,6 +352,30 @@ int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, int mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set); +static inline int mdiodev_read(struct mdio_device *mdiodev, u32 regnum) +{ + return mdiobus_read(mdiodev->bus, mdiodev->addr, regnum); +} + +static inline int mdiodev_write(struct mdio_device *mdiodev, u32 regnum, + u16 val) +{ + return mdiobus_write(mdiodev->bus, mdiodev->addr, regnum, val); +} + +static inline int mdiodev_modify(struct mdio_device *mdiodev, u32 regnum, + u16 mask, u16 set) +{ + return mdiobus_modify(mdiodev->bus, mdiodev->addr, regnum, mask, set); +} + +static inline int mdiodev_modify_changed(struct mdio_device *mdiodev, + u32 regnum, u16 mask, u16 set) +{ + return mdiobus_modify_changed(mdiodev->bus, mdiodev->addr, regnum, + mask, set); +} + static inline u32 mdiobus_c45_addr(int devad, u16 regnum) { return MII_ADDR_C45 | devad << MII_DEVADDR_C45_SHIFT | regnum; -- cgit v1.2.3 From 218f23e8a96fea7185dac68ceb3722d0831a8bcb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Oct 2021 09:17:01 -0700 Subject: net: phy: bcm7xxx: Add EPHY entry for 7712 7712 is a 16nm process SoC with a 10/100 integrated Ethernet PHY, utilize the recently defined 16nm EPHY macro to configure that PHY. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 27d9b6683f0e..747fad264033 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -50,6 +50,7 @@ #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 #define PHY_ID_BCM7445 0x600d8510 +#define PHY_ID_BCM7712 0x35905330 #define PHY_ID_BCM_CYGNUS 0xae025200 #define PHY_ID_BCM_OMEGA 0xae025100 -- cgit v1.2.3 From 97308f8b0d867e9ef59528cd97f0db55ffdf5651 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 23 Jul 2021 01:59:41 +0200 Subject: iomap: Support partial direct I/O on user copy failures In iomap_dio_rw, when iomap_apply returns an -EFAULT error and the IOMAP_DIO_PARTIAL flag is set, complete the request synchronously and return a partial result. This allows the caller to deal with the page fault and retry the remainder of the request. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J. Wong --- include/linux/iomap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..2a213b0d1e1f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -330,6 +330,13 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) +/* + * When a page fault occurs, return a partial synchronous result and allow + * the caller to retry the rest of the operation after dealing with the page + * fault. + */ +#define IOMAP_DIO_PARTIAL (1 << 2) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); -- cgit v1.2.3 From 4fdccaa0d184c202f98d73b24e3ec8eeee88ab8d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 24 Jul 2021 12:26:41 +0200 Subject: iomap: Add done_before argument to iomap_dio_rw Add a done_before argument to iomap_dio_rw that indicates how much of the request has already been transferred. When the request succeeds, we report that done_before additional bytes were transferred. This is useful for finishing a request asynchronously when part of the request has already been completed synchronously. We'll use that to allow iomap_dio_rw to be used with page faults disabled: when a page fault occurs while submitting a request, we synchronously complete the part of the request that has already been submitted. The caller can then take care of the page fault and call iomap_dio_rw again for the rest of the request, passing in the number of bytes already transferred. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J.
Wong --- include/linux/iomap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2a213b0d1e1f..829f2325ecba 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -339,10 +339,10 @@ struct iomap_dio_ops { ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags); + unsigned int dio_flags, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); -- cgit v1.2.3 From 55b8fe703bc51200d4698596c90813453b35ae63 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 17 Aug 2021 22:52:08 +0200 Subject: gup: Introduce FOLL_NOFAULT flag to disable page faults Introduce a new FOLL_NOFAULT flag that causes get_user_pages to return -EFAULT when it would otherwise trigger a page fault. This is roughly similar to FOLL_FAST_ONLY but available on all architectures, and less fragile. Signed-off-by: Andreas Gruenbacher --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..2f0e6b9f8f3b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2851,7 +2851,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in page */ +#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ -- cgit v1.2.3 From 3337ab08d08b1a375f88471d9c8b1cac968cb054 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 12 Jul 2021 12:06:14 +0200 Subject: iov_iter: Introduce nofault flag to disable page faults Introduce a new nofault flag to indicate to iov_iter_get_pages not to fault in user pages. This is implemented by passing the FOLL_NOFAULT flag to get_user_pages, which causes get_user_pages to fail when it would otherwise fault in a page. We'll use the ->nofault flag to prevent iomap_dio_rw from faulting in pages when page faults are not allowed. 
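Put together, a filesystem can drive these pieces with a retry loop along these lines (a sketch; iocb, iter and ops stand for the caller's kiocb, iov_iter and iomap ops, and fault_in_user_pages() is a placeholder for the fault-in helpers added elsewhere in this series, returning true on success):

    size_t done = 0;
    ssize_t ret;

    for (;;) {
        ret = iomap_dio_rw(iocb, iter, ops, NULL,
                           IOMAP_DIO_PARTIAL, done);
        if (ret > 0)
            done = ret;     /* total so far, incl. done_before */
        if (ret != -EFAULT)
            break;
        /* Fault in the user buffer, then finish the request. */
        if (!fault_in_user_pages(iter))
            break;
    }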
Signed-off-by: Andreas Gruenbacher --- include/linux/uio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 25d1c24fd829..6350354f97e9 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -35,6 +35,7 @@ struct iov_iter_state { struct iov_iter { u8 iter_type; + bool nofault; bool data_source; size_t iov_offset; size_t count; -- cgit v1.2.3 From 63dfe0709643528290c8a6825f278eda0e3f3c2e Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:32 +0900 Subject: can: bittiming: allow TDC{V,O} to be zero and add can_tdc_const::tdc{v,o,f}_min ISO 11898-1 specifies in section 11.3.3 "Transmitter delay compensation" that "the configuration range for [the] SSP position shall be at least 0 to 63 minimum time quanta." Because SSP = TDCV + TDCO, it means that we should allow both TDCV and TDCO to hold a zero value in order to honor SSP's minimum possible value. However, the current implementation assigned special meaning to TDCV and TDCO's zero values: * TDCV = 0 -> TDCV is automatically measured by the transceiver. * TDCO = 0 -> TDC is off. In order to allow for those values to really be zero and to maintain current features, we introduce two new flags: * CAN_CTRLMODE_TDC_AUTO indicates that the controller supports automatic measurement of TDCV. * CAN_CTRLMODE_TDC_MANUAL indicates that the controller supports manual configuration of TDCV. N.B.: the current implementation failed to provide an option for the driver to indicate that only manual mode was supported. TDC is disabled if both CAN_CTRLMODE_TDC_AUTO and CAN_CTRLMODE_TDC_MANUAL flags are off, c.f. the helper function can_tdc_is_enabled() which is also introduced in this patch. Also, this patch adds three fields: tdcv_min, tdco_min and tdcf_min to struct can_tdc_const. While we are not convinced that those three fields could be anything other than zero, we can imagine that some controllers might specify a lower bound on these. Thus, those minimums are really added "just in case". Comments of struct can_tdc and can_tdc_const are updated accordingly. Finally, the changes are applied to the etas_es58x driver. Link: https://lore.kernel.org/all/20210918095637.20108-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 64 +++++++++++++++++++++++++++++++------------ include/linux/can/dev.h | 4 +++ 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 9de6e9053e34..9e20260611cc 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -19,6 +19,9 @@ /* Megahertz */ #define CAN_MHZ 1000000UL +#define CAN_CTRLMODE_TDC_MASK \ + (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) + /* * struct can_tdc - CAN FD Transmission Delay Compensation parameters * @@ -33,29 +36,43 @@ * * This structure contains the parameters to calculate that SSP. * - * @tdcv: Transmitter Delay Compensation Value. Distance, in time - * quanta, from when the bit is sent on the TX pin to when it is - * received on the RX pin of the transmitter. Possible options: + * -+----------- one bit ----------+-- TX pin + * |<--- Sample Point --->| + * + * --+----------- one bit ----------+-- RX pin + * |<-------- TDCV -------->| + * |<------- TDCO ------->| + * |<----------- Secondary Sample Point ---------->| + * + * @tdcv: Transmitter Delay Compensation Value.
The time needed for + * the signal to propagate, i.e. the distance, in time quanta, + * from the start of the bit on the TX pin to when it is received + * on the RX pin. @tdcv depends on the controller modes: + * + * CAN_CTRLMODE_TDC_AUTO is set: The transceiver dynamically + * measures @tdcv for each transmitted CAN FD frame and the + * value provided here should be ignored. * - * 0: automatic mode. The controller dynamically measures @tdcv - * for each transmitted CAN FD frame. + * CAN_CTRLMODE_TDC_MANUAL is set: use the fixed provided @tdcv + * value. * - * Other values: manual mode. Use the fixed provided value. + * N.B. CAN_CTRLMODE_TDC_AUTO and CAN_CTRLMODE_TDC_MANUAL are + * mutually exclusive. Only one can be set at a time. If both + * CAN_TDC_CTRLMODE_AUTO and CAN_TDC_CTRLMODE_MANUAL are unset, + * TDC is disabled and all the values of this structure should be + * ignored. * * @tdco: Transmitter Delay Compensation Offset. Offset value, in time * quanta, defining the distance between the start of the bit * reception on the RX pin of the transceiver and the SSP * position such that SSP = @tdcv + @tdco. * - * If @tdco is zero, then TDC is disabled and both @tdcv and - * @tdcf should be ignored. - * * @tdcf: Transmitter Delay Compensation Filter window. Defines the - * minimum value for the SSP position in time quanta. If SSP is - * less than @tdcf, then no delay compensations occur and the - * normal sampling point is used instead. The feature is enabled - * if and only if @tdcv is set to zero (automatic mode) and @tdcf - * is configured to a value greater than @tdco. + * minimum value for the SSP position in time quanta. If the SSP + * position is less than @tdcf, then no delay compensations occur + * and the normal sampling point is used instead. The feature is + * enabled if and only if @tdcv is set to zero (automatic mode) + * and @tdcf is configured to a value greater than @tdco. */ struct can_tdc { u32 tdcv; @@ -67,19 +84,32 @@ struct can_tdc { * struct can_tdc_const - CAN hardware-dependent constant for * Transmission Delay Compensation * - * @tdcv_max: Transmitter Delay Compensation Value maximum value. - * Should be set to zero if the controller does not support - * manual mode for tdcv. + * @tdcv_min: Transmitter Delay Compensation Value minimum value. If + * the controller does not support manual mode for tdcv + * (c.f. flag CAN_CTRLMODE_TDC_MANUAL) then this value is + * ignored. + * @tdcv_max: Transmitter Delay Compensation Value maximum value. If + * the controller does not support manual mode for tdcv + * (c.f. flag CAN_CTRLMODE_TDC_MANUAL) then this value is + * ignored. + * + * @tdco_min: Transmitter Delay Compensation Offset minimum value. * @tdco_max: Transmitter Delay Compensation Offset maximum value. * Should not be zero. If the controller does not support TDC, * then the pointer to this structure should be NULL. + * + * @tdcf_min: Transmitter Delay Compensation Filter window minimum + * value. If @tdcf_max is zero, this value is ignored. * @tdcf_max: Transmitter Delay Compensation Filter window maximum * value. Should be set to zero if the controller does not * support this feature. 
*/ struct can_tdc_const { + u32 tdcv_min; u32 tdcv_max; + u32 tdco_min; u32 tdco_max; + u32 tdcf_min; u32 tdcf_max; }; diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 2413253e54c7..6dacbbb41e68 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -96,6 +96,10 @@ struct can_priv { #endif }; +static inline bool can_tdc_is_enabled(const struct can_priv *priv) +{ + return !!(priv->ctrlmode & CAN_CTRLMODE_TDC_MASK); +} /* helper to define static CAN controller features at device creation time */ static inline void can_set_static_ctrlmode(struct net_device *dev, -- cgit v1.2.3 From 39f66c9e229797a58a12ea78388cbbad1f81aec9 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:33 +0900 Subject: can: bittiming: change unit of TDC parameters to clock periods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current implementation, all Transmission Delay Compensation (TDC) parameters are expressed in time quantum. However, ISO 11898-1 actually specifies that these should be expressed in *minimum* time quantum. Furthermore, the minimum time quantum is specified to be "one node clock period long" (c.f. paragraph 11.3.1.1 "Bit time"). For the sake of simplicity, we prefer to use the "clock period" term instead of "minimum time quantum" because we believe that it is more broadly understood. This patch fixes that discrepancy by updating the documentation and the formula for TDCO calculation. N.B. In can_calc_tdco(), the sample point (in time quantum) was calculated using a division, thus introducing a risk of rounding and truncation errors. On top of changing the unit to clock period, we also modified the formula to use only additions. Link: https://lore.kernel.org/all/20210918095637.20108-3-mailhol.vincent@wanadoo.fr Suggested-by: Stefan Mätje Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 9e20260611cc..aebbe65dab7e 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -31,8 +31,8 @@ * * To solve this issue, ISO 11898-1 introduces in section 11.3.3 * "Transmitter delay compensation" a SSP (Secondary Sample Point) - * equal to the distance, in time quanta, from the start of the bit - * time on the TX pin to the actual measurement on the RX pin. + * equal to the distance from the start of the bit time on the TX pin + * to the actual measurement on the RX pin. * * This structure contains the parameters to calculate that SSP. * @@ -44,8 +44,13 @@ * |<------- TDCO ------->| * |<----------- Secondary Sample Point ---------->| * + * To increase precision, contrary to the other bittiming parameters + * which are measured in time quanta, the TDC parameters are measured + * in clock periods (also referred as "minimum time quantum" in ISO + * 11898-1). + * * @tdcv: Transmitter Delay Compensation Value. The time needed for - * the signal to propagate, i.e. the distance, in time quanta, + * the signal to propagate, i.e. the distance, in clock periods, * from the start of the bit on the TX pin to when it is received * on the RX pin. @tdcv depends on the controller modes: @@ -62,17 +67,18 @@ * TDC is disabled and all the values of this structure should be * ignored. * - * @tdco: Transmitter Delay Compensation Offset.
Offset value, in time - * quanta, defining the distance between the start of the bit - * reception on the RX pin of the transceiver and the SSP + * @tdco: Transmitter Delay Compensation Offset. Offset value, in + * clock periods, defining the distance between the start of the + * bit reception on the RX pin of the transceiver and the SSP * position such that SSP = @tdcv + @tdco. * * @tdcf: Transmitter Delay Compensation Filter window. Defines the - * minimum value for the SSP position in time quanta. If the SSP - * position is less than @tdcf, then no delay compensations occur - * and the normal sampling point is used instead. The feature is - * enabled if and only if @tdcv is set to zero (automatic mode) - * and @tdcf is configured to a value greater than @tdco. + * minimum value for the SSP position in clock periods. If the + * SSP position is less than @tdcf, then no delay compensations + * occur and the normal sampling point is used instead. The + * feature is enabled if and only if @tdcv is set to zero + * (automatic mode) and @tdcf is configured to a value greater + * than @tdco. */ struct can_tdc { u32 tdcv; -- cgit v1.2.3 From da45a1e4d7b9d6b5a8231acb812df719fe3228b4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:34 +0900 Subject: can: bittiming: change can_calc_tdco()'s prototype to not directly modify priv The function can_calc_tdco() directly retrieves can_priv from the net_device and directly modifies it. This is annoying for the upcoming patch. In drivers/net/can/dev/netlink.c:can_changelink(), the data bittiming are written to a temporary structure and memcpyed to can_priv only after everything succeeded. In the next patch, where we will introduce the netlink interface for TDC parameters, we will add a new TDC block which can potentially fail. For this reason, the data bittiming temporary structure has to be copied after that to-be-introduced TDC block. However, TDC also needs to access data bittiming information. We change the prototype so that the data bittiming structure is passed to can_calc_tdco() as an argument instead of retrieving it from priv. This way can_calc_tdco() can access the data bittiming before it gets memcpyed to priv. 
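In can_changelink(), the call then operates on the temporary data bittiming before it is committed (a sketch of the intended flow):

    struct can_bittiming dbt;   /* temporary, validated copy */

    /* ... fill and check dbt first ... */

    can_calc_tdco(&priv->tdc, priv->tdc_const, &dbt,
                  &priv->ctrlmode, priv->ctrlmode_supported);

    memcpy(&priv->data_bittiming, &dbt, sizeof(dbt));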
Link: https://lore.kernel.org/all/20210918095637.20108-4-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index aebbe65dab7e..20b50baf3a02 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -123,7 +123,9 @@ int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc); -void can_calc_tdco(struct net_device *dev); +void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, + const struct can_bittiming *dbt, + u32 *ctrlmode, u32 ctrlmode_supported); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, @@ -133,7 +135,10 @@ can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, return -EINVAL; } -static inline void can_calc_tdco(struct net_device *dev) +static inline void +can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, + const struct can_bittiming *dbt, + u32 *ctrlmode, u32 ctrlmode_supported) { } #endif /* CONFIG_CAN_CALC_BITTIMING */ -- cgit v1.2.3 From e8060f08cd69d1d692cfb9f0a2808477a501f35a Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:36 +0900 Subject: can: netlink: add can_priv::do_get_auto_tdcv() to retrieve tdcv from device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some CAN devices can measure the TDCV (Transmission Delay Compensation Value) automatically for each transmitted CAN frame. A callback function do_get_auto_tdcv() is added to retrieve that value. This function is used only if CAN_CTRLMODE_TDC_AUTO is enabled (if CAN_CTRLMODE_TDC_MANUAL is selected, the TDCV value is provided by the user). If the device does not support reporting of TDCV, do_get_auto_tdcv() should be set to NULL and TDCV will not be reported by the netlink interface. On success, do_get_auto_tdcv() shall return 0. If the value cannot be measured by the device, for example because the network is down or because no frames were transmitted yet, can_priv::do_get_auto_tdcv() shall return a negative error code (e.g. -EINVAL) to signify that the value is not yet available. In such cases, TDCV is not reported by the netlink interface. Link: https://lore.kernel.org/all/20210918095637.20108-6-mailhol.vincent@wanadoo.fr CC: Stefan Mätje Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 6dacbbb41e68..b4aa0f048cab 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -82,6 +82,7 @@ struct can_priv { enum can_state *state); int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); unsigned int echo_skb_max; struct sk_buff **echo_skb; -- cgit v1.2.3 From fa759a9395ea81c17db613dde43c46f0607df7e7 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:37 +0900 Subject: can: dev: add can_tdc_get_relative_tdco() helper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct can_tdc::tdco represents the absolute offset from TDCV.
Some controllers instead use an offset relative to the Sample Point (SP) such that: | SSP = TDCV + absolute TDCO | = TDCV + SP + relative TDCO Consequently: | relative TDCO = absolute TDCO - SP The function can_tdc_get_relative_tdco() allows this relative TDCO value to be retrieved. Link: https://lore.kernel.org/all/20210918095637.20108-7-mailhol.vincent@wanadoo.fr CC: Stefan Mätje Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index b4aa0f048cab..45f19d9db5ca 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -102,6 +102,35 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv) return !!(priv->ctrlmode & CAN_CTRLMODE_TDC_MASK); } +/* + * can_get_relative_tdco() - TDCO relative to the sample point + * + * struct can_tdc::tdco represents the absolute offset from TDCV. Some + * controllers use instead an offset relative to the Sample Point (SP) + * such that: + * + * SSP = TDCV + absolute TDCO + * = TDCV + SP + relative TDCO + * + * -+----------- one bit ----------+-- TX pin + * |<--- Sample Point --->| + * + * --+----------- one bit ----------+-- RX pin + * |<-------- TDCV -------->| + * |<------------------------>| absolute TDCO + * |<--- Sample Point --->| + * | |<->| relative TDCO + * |<------------- Secondary Sample Point ------------>| + */ +static inline s32 can_get_relative_tdco(const struct can_priv *priv) +{ + const struct can_bittiming *dbt = &priv->data_bittiming; + s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + + dbt->phase_seg1) * dbt->brp; + + return (s32)priv->tdc.tdco - sample_point_in_tc; +} + /* helper to define static CAN controller features at device creation time */ static inline void can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) -- cgit v1.2.3 From b3b180e735409ca0c76642014304b59482e0e653 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 20 Sep 2021 14:20:07 +0200 Subject: dmaengine: remove debugfs #ifdef The ptdma driver has added debugfs support, but this fails to build when debugfs is disabled: drivers/dma/ptdma/ptdma-debugfs.c: In function 'ptdma_debugfs_setup': drivers/dma/ptdma/ptdma-debugfs.c:93:54: error: 'struct dma_device' has no member named 'dbg_dev_root' 93 | debugfs_create_file("info", 0400, pt->dma_dev.dbg_dev_root, pt, | ^ drivers/dma/ptdma/ptdma-debugfs.c:96:55: error: 'struct dma_device' has no member named 'dbg_dev_root' 96 | debugfs_create_file("stats", 0400, pt->dma_dev.dbg_dev_root, pt, | ^ drivers/dma/ptdma/ptdma-debugfs.c:102:52: error: 'struct dma_device' has no member named 'dbg_dev_root' 102 | debugfs_create_dir("q", pt->dma_dev.dbg_dev_root); | ^ Remove the #ifdef in the header, as this only saves a few bytes, but would require ugly #ifdefs in each driver using it. Simplify the other user while we're at it.
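With the #ifdef gone, a driver can assign the callback unconditionally (a sketch with a hypothetical foo driver):

    static void foo_dbg_summary_show(struct seq_file *s,
                                     struct dma_device *dma_dev)
    {
        seq_printf(s, "%s: summary\n", dev_name(dma_dev->dev));
    }

    /* No #ifdef CONFIG_DEBUG_FS needed around this anymore. */
    dma_dev->dbg_summary_show = foo_dbg_summary_show;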
Fixes: e2fb2e2a33fa ("dmaengine: ptdma: Add debugfs entries for PTDMA") Fixes: 26cf132de6f7 ("dmaengine: Create debug directories for DMA devices") Signed-off-by: Arnd Bergmann Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20210920122017.205975-1-arnd@kernel.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index e5c2c9e71bf1..9000f3ffce8b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -944,10 +944,8 @@ struct dma_device { void (*device_issue_pending)(struct dma_chan *chan); void (*device_release)(struct dma_device *dev); /* debugfs support */ -#ifdef CONFIG_DEBUG_FS void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev); struct dentry *dbg_dev_root; -#endif }; static inline int dmaengine_slave_config(struct dma_chan *chan, -- cgit v1.2.3 From 1ba5478270a5c3a02b08052a3c003c282f2db94a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Oct 2021 17:49:21 +0200 Subject: irqchip: Fix compile-testing without CONFIG_OF Drivers using the new IRQCHIP_PLATFORM_DRIVER_BEGIN helper fail to link when compile-testing without CONFIG_OF, as that means CONFIG_IRQCHIP is disabled as well: ld.lld: error: undefined symbol: platform_irqchip_probe >>> referenced by irq-meson-gpio.c >>> irqchip/irq-meson-gpio.o:(meson_gpio_intc_driver) in archive drivers/built-in.a >>> referenced by irq-mchp-eic.c >>> irqchip/irq-mchp-eic.o:(mchp_eic_driver) in archive drivers/built-in.a As the drivers are not actually used in this case, just making the reference to this symbol conditional helps avoid the link failure. Fixes: f8410e626569 ("irqchip: Add IRQCHIP_PLATFORM_DRIVER_BEGIN/END and IRQCHIP_MATCH helper macros") Signed-off-by: Arnd Bergmann Reviewed-by: Florian Fainelli Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211022154927.920491-1-arnd@kernel.org --- include/linux/irqchip.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index 67351aac65ef..29dbe675fc7a 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -39,8 +39,9 @@ static const struct of_device_id drv_name##_irqchip_match_table[] = { {}, \ }; \ MODULE_DEVICE_TABLE(of, drv_name##_irqchip_match_table); \ -static struct platform_driver drv_name##_driver = { \ - .probe = platform_irqchip_probe, \ +static struct platform_driver drv_name##_driver = { \ + .probe = IS_ENABLED(CONFIG_IRQCHIP) ? \ + platform_irqchip_probe : NULL, \ .driver = { \ .name = #drv_name, \ .owner = THIS_MODULE, \ -- cgit v1.2.3 From f2739ca15c414ebad88f4333e3186fd4144c1753 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 22 Oct 2021 11:46:42 -0500 Subject: x86/of: Kill unused early_init_dt_scan_chosen_arch() There are no callers for early_init_dt_scan_chosen_arch(), so remove it. 
Signed-off-by: Rob Herring Signed-off-by: Borislav Petkov Reviewed-by: Frank Rowand Link: https://lkml.kernel.org/r/20211022164642.2815706-1-robh@kernel.org --- include/linux/of_fdt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index cf6a65b94d40..cf48983d3c86 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -65,7 +65,6 @@ extern int early_init_dt_scan_memory(unsigned long node, const char *uname, extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern void early_init_fdt_reserve_self(void); -extern void __init early_init_dt_scan_chosen_arch(unsigned long node); extern void early_init_dt_add_memory_arch(u64 base, u64 size); extern u64 dt_mem_next_cell(int s, const __be32 **cellp); -- cgit v1.2.3 From a1b09501971435ef213251891753afb0d7f3d27a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 20 Oct 2021 11:24:06 +0100 Subject: irq: add generic_handle_arch_irq() Several architectures select GENERIC_IRQ_MULTI_HANDLER and branch to handle_arch_irq() without performing any entry accounting. Add a generic wrapper to handle the common irqentry work when invoking handle_arch_irq(). Where an architecture needs to perform some entry accounting itself, it will need to invoke handle_arch_irq() itself. In subsequent patches it will become the responsibility of the entry code to set the irq regs when entering an IRQ (rather than deferring this to an irqchip handler), so generic_handle_arch_irq() is made to set the irq regs now. This can be redundant in some cases, but is never harmful as saving/restoring the old regs nests safely. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Reviewed-by: Guo Ren Cc: Thomas Gleixner --- include/linux/irq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c8293c817646..988c225eef2d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1261,6 +1261,7 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ -- cgit v1.2.3 From 63c67f526db86d3102a77437a510c949f6debb08 Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Sun, 24 Oct 2021 16:27:34 +0800 Subject: net: phy: add genphy_c45_fast_retrain Add a generic fast retrain auto-negotiation function for C45 PHYs. Signed-off-by: Luo Jie Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 736e1d1a47c4..04e90423fa88 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1584,6 +1584,7 @@ int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); int genphy_c45_pma_resume(struct phy_device *phydev); int genphy_c45_pma_suspend(struct phy_device *phydev); +int genphy_c45_fast_retrain(struct phy_device *phydev, bool enable); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; -- cgit v1.2.3 From 6b19b766e8f077f29cdb47da5003469a85bbfb9c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Oct 2021 09:22:35 -0600 Subject: fs: get rid of the res2 iocb->ki_complete argument The second argument was only used by the USB gadget code, yet everyone pays the overhead of passing a zero into aio, where it ends up being part of the aio res2 value. Now that everybody is passing in zero, kill off the extra argument. Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 31029a91f440..0dcb9020a7b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -330,7 +330,7 @@ struct kiocb { randomized_struct_fields_start loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_hint; -- cgit v1.2.3 From cb464ba53c0cb497dcb4a3daaf4fad4b75291863 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 11 Oct 2021 13:14:28 +0300 Subject: net/mlx5: Extend health buffer dump Enhance health buffer to include: - assert_var5: expose the 6th assert variable. - time: error's time-stamp in seconds (epoch time). - rfr: Recovery Flow Required. When set, indicates that the error cannot be recovered without a flow involving reset. - severity: error's severity value, ranging from emergency to debug. Expose them in the health buffer dump (dmesg and devlink fw reporter).
Health buffer in dmesg: mlx5_core 0000:08:00.0: print_health_info:425:(pid 912): Health issue observed, firmware internal error, severity(3) ERROR: mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[0] 0x08040700 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[1] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[2] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[3] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[4] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[5] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:432:(pid 912): assert_exit_ptr 0x00aaf800 mlx5_core 0000:08:00.0: print_health_info:434:(pid 912): assert_callra 0x00aaf70c mlx5_core 0000:08:00.0: print_health_info:436:(pid 912): fw_ver 16.32.492 mlx5_core 0000:08:00.0: print_health_info:437:(pid 912): time 1634819758 mlx5_core 0000:08:00.0: print_health_info:438:(pid 912): hw_id 0x0000020d mlx5_core 0000:08:00.0: print_health_info:439:(pid 912): rfr 0 mlx5_core 0000:08:00.0: print_health_info:440:(pid 912): severity 3 (ERROR) mlx5_core 0000:08:00.0: print_health_info:441:(pid 912): irisc_index 9 mlx5_core 0000:08:00.0: print_health_info:442:(pid 912): synd 0x1: firmware internal error mlx5_core 0000:08:00.0: print_health_info:444:(pid 912): ext_synd 0x802b mlx5_core 0000:08:00.0: print_health_info:445:(pid 912): raw fw_ver 0x102001ec Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 14 ++++++++------ include/linux/mlx5/mlx5_ifc.h | 10 ++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 347167c18802..f8a0bbb42c3b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -541,19 +541,21 @@ struct mlx5_cmd_layout { u8 status_own; }; -enum mlx5_fatal_assert_bit_offsets { - MLX5_RFR_OFFSET = 31, +enum mlx5_rfr_severity_bit_offsets { + MLX5_RFR_BIT_OFFSET = 0x7, }; struct health_buffer { - __be32 assert_var[5]; - __be32 rsvd0[3]; + __be32 assert_var[6]; + __be32 rsvd0[2]; __be32 assert_exit_ptr; __be32 assert_callra; - __be32 rsvd1[2]; + __be32 rsvd1[1]; + __be32 time; __be32 fw_ver; __be32 hw_id; - __be32 rfr; + u8 rfr_severity; + u8 rsvd2[3]; u8 irisc_index; u8 synd; __be16 ext_synd; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 09e43019d877..6d292b5b8992 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4149,13 +4149,19 @@ struct mlx5_ifc_health_buffer_bits { u8 assert_callra[0x20]; - u8 reserved_at_140[0x40]; + u8 reserved_at_140[0x20]; + + u8 time[0x20]; u8 fw_version[0x20]; u8 hw_id[0x20]; - u8 reserved_at_1c0[0x20]; + u8 rfr[0x1]; + u8 reserved_at_1c1[0x3]; + u8 valid[0x1]; + u8 severity[0x3]; + u8 reserved_at_1c8[0x18]; u8 irisc_index[0x8]; u8 synd[0x8]; -- cgit v1.2.3 From 5a1023deeed02a2078bcc11eec1c4be31e85892d Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 13 Oct 2021 09:45:22 +0300 Subject: net/mlx5: Add periodic update of host time to firmware Firmware logs its asserts also to non-volatile memory. In order to reduce drift between the NIC and the host, the driver sets the host epoch-time to the firmware every hour. 
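The hourly update maps naturally onto a self-rearming delayed work. A minimal sketch of the shape, assuming a helper that performs the access-register write (mlx5_write_mrtc() and the period constant are placeholders, not the driver's real identifiers):

```
#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define EXAMPLE_FW_LOG_TS_PERIOD (60 * 60 * HZ)	/* one hour */

void mlx5_write_mrtc(struct mlx5_core_health *health);	/* hypothetical */

static void example_update_fw_log_ts(struct work_struct *work)
{
	struct mlx5_core_health *health =
		container_of(work, struct mlx5_core_health,
			     update_fw_log_ts_work.work);

	/* Push the host epoch time to firmware (MRTC register). */
	mlx5_write_mrtc(health);

	/* Re-arm so the write repeats every hour. */
	schedule_delayed_work(&health->update_fw_log_ts_work,
			      EXAMPLE_FW_LOG_TS_PERIOD);
}
```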
Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3f4c0f2314a5..f617dfbcd9fd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -134,6 +134,7 @@ enum { MLX5_REG_MCIA = 0x9014, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, + MLX5_REG_MRTC = 0x902d, MLX5_REG_MTRC_CAP = 0x9040, MLX5_REG_MTRC_CONF = 0x9041, MLX5_REG_MTRC_STDB = 0x9042, @@ -440,6 +441,7 @@ struct mlx5_core_health { struct work_struct report_work; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_fatal_reporter; + struct delayed_work update_fw_log_ts_work; }; struct mlx5_qp_table { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6d292b5b8992..746381eccccf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10358,6 +10358,17 @@ struct mlx5_ifc_pddr_reg_bits { union mlx5_ifc_pddr_reg_page_data_auto_bits page_data; }; +struct mlx5_ifc_mrtc_reg_bits { + u8 time_synced[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0x20]; + + u8 time_h[0x20]; + + u8 time_l[0x20]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -10419,6 +10430,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mirc_reg_bits mirc_reg; struct mlx5_ifc_mfrl_reg_bits mfrl_reg; struct mlx5_ifc_mtutc_reg_bits mtutc_reg; + struct mlx5_ifc_mrtc_reg_bits mrtc_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From 46ae40b94d8826591472798114a723cc7feac7a7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 12 Aug 2021 11:53:34 +0300 Subject: net/mlx5: Let user configure io_eq_size param Currently, each I/O EQ is taking 128KB of memory. This size is not needed in all use cases, and is critical with large scale. Hence, allow user to configure the size of I/O EQs. For example, to reduce I/O EQ size to 64, execute: $ devlink resource set pci/0000:00:0b.0 path /io_eq_size/ size 64 $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f617dfbcd9fd..47c07f95bbe1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -797,10 +797,6 @@ struct mlx5_db { int index; }; -enum { - MLX5_COMP_EQ_SIZE = 1024, -}; - enum { MLX5_PTYS_IB = 1 << 0, MLX5_PTYS_EN = 1 << 2, -- cgit v1.2.3 From a6cb08daa3b459e3dab1d98c67cb8c931f4d81a5 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 13 Oct 2021 09:57:54 +0300 Subject: net/mlx5: Let user configure event_eq_size param Event EQ is an EQ which received the notification of almost all the events generated by the NIC. Currently, each event EQ is taking 512KB of memory. This size is not needed in most use cases, and is critical with large scale. Hence, allow user to configure the size of the event EQ. 
For example to reduce event EQ size to 64, execute:: $ devlink resource set pci/0000:00:0b.0 path /event_eq_size/ size 64 $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index ea3ff5a8ced3..11161e427608 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -5,7 +5,6 @@ #define MLX5_CORE_EQ_H #define MLX5_NUM_CMD_EQE (32) -#define MLX5_NUM_ASYNC_EQE (0x1000) #define MLX5_NUM_SPARE_EQE (0x80) struct mlx5_eq; -- cgit v1.2.3 From 554604061979d656bbbec50f101526349cd5aa5f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 16 Aug 2021 08:41:08 +0300 Subject: net/mlx5: Let user configure max_macs param Currently, max_macs is taking 70Kbytes of memory per function. This size is not needed in all use cases, and is critical with large scale. Hence, allow user to configure the number of max_macs. For example, to reduce the number of max_macs to 1, execute:: $ devlink dev param set pci/0000:00:0b.0 name max_macs value 1 \ cmode driverinit $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 746381eccccf..97465d00de9d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1603,7 +1603,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ext_stride_num_range[0x1]; u8 roce_rw_supported[0x1]; - u8 reserved_at_3a2[0x1]; + u8 log_max_current_uc_list_wr_supported[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3 From 95cadae320be46583078690ac89ffe63c95cc9d2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 25 Oct 2021 17:05:28 -0400 Subject: fortify: strlen: Avoid shadowing previous locals The __compiletime_strlen() macro expansion will shadow p_size and p_len local variables. No callers currently use any of the shadowed names for their "p" variable, so there are no code generation problems. 
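For illustration only (this snippet is not from the kernel tree), the shadowing pattern the warning complains about is a statement-expression macro declaring a local with the same name as one in its caller:

```
#include <string.h>

#define SHADOWING_STRLEN(p)					\
({								\
	size_t p_size = sizeof(*(p));	/* shadows the caller's p_size */ \
	p_size ? strlen(p) : 0;					\
})

size_t example(const char *s)
{
	size_t p_size = 16;	/* -Wshadow (enabled at W=2) warns below */

	return SHADOWING_STRLEN(s) + p_size;
}
```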
Add "__" prefixes to the variable definitions in __compiletime_strlen() to avoid new W=2 warnings: ./include/linux/fortify-string.h: In function 'strnlen': ./include/linux/fortify-string.h:17:9: warning: declaration of 'p_size' shadows a previous local [-Wshadow] 17 | size_t p_size = __builtin_object_size(p, 1); \ | ^~~~~~ ./include/linux/fortify-string.h:77:17: note: in expansion of macro '__compiletime_strlen' 77 | size_t p_len = __compiletime_strlen(p); | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/fortify-string.h:76:9: note: shadowed declaration is here 76 | size_t p_size = __builtin_object_size(p, 1); | ^~~~~~ Signed-off-by: Qian Cai Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20211025210528.261643-1-quic_qiancai@quicinc.com --- include/linux/fortify-string.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index fdb0a74c9ca2..a6cd6815f249 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -10,18 +10,18 @@ void __read_overflow(void) __compiletime_error("detected read beyond size of obj void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); -#define __compiletime_strlen(p) \ -({ \ - unsigned char *__p = (unsigned char *)(p); \ - size_t ret = (size_t)-1; \ - size_t p_size = __builtin_object_size(p, 1); \ - if (p_size != (size_t)-1) { \ - size_t p_len = p_size - 1; \ - if (__builtin_constant_p(__p[p_len]) && \ - __p[p_len] == '\0') \ - ret = __builtin_strlen(__p); \ - } \ - ret; \ +#define __compiletime_strlen(p) \ +({ \ + unsigned char *__p = (unsigned char *)(p); \ + size_t __ret = (size_t)-1; \ + size_t __p_size = __builtin_object_size(p, 1); \ + if (__p_size != (size_t)-1) { \ + size_t __p_len = __p_size - 1; \ + if (__builtin_constant_p(__p[__p_len]) && \ + __p[__p_len] == '\0') \ + __ret = __builtin_strlen(__p); \ + } \ + __ret; \ }) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -- cgit v1.2.3 From ef57c1610dd8fba5031bf71e0db73356190de151 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Oct 2021 09:48:17 -0700 Subject: ipv6: move inet6_sk(sk)->rx_dst_cookie to sk->sk_rx_dst_cookie Increase cache locality by moving rx_dst_cookie next to sk->sk_rx_dst. This removes one or two cache line misses in IPv6 early demux (TCP/UDP) Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ef4a69865737..c383630d3f06 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -282,7 +282,6 @@ struct ipv6_pinfo { __be32 rcv_flowinfo; __u32 dst_cookie; - __u32 rx_dst_cookie; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; -- cgit v1.2.3 From 79ca6f74dae067681a779fd573c2eb59649989bc Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Wed, 8 Sep 2021 02:26:06 -0700 Subject: tpm: fix Atmel TPM crash caused by too frequent queries The Atmel TPM 1.2 chips crash with error `tpm_try_transmit: send(): error -62` since kernel 4.14. It is observed from the kernel log after running `tpm_sealdata -z`.
The error thrown from the command is as follows ``` $ tpm_sealdata -z Tspi_Key_LoadKey failed: 0x00001087 - layer=tddl, code=0087 (135), I/O error ``` The issue was reproduced with the following Atmel TPM chip: ``` $ tpm_version T0 TPM 1.2 Version Info: Chip Version: 1.2.66.1 Spec Level: 2 Errata Revision: 3 TPM Vendor ID: ATML TPM Version: 01010000 Manufacturer Info: 41544d4c ``` The root cause of the issue is that the TPM calls to msleep() were replaced with usleep_range() [1], which reduces the actual timeout. Via experiments, it is observed that the original msleep(5) actually sleeps for 15ms. Because of a known timeout issue in the Atmel TPM 1.2 chip, a timeout shorter than 15ms can cause the error described above. A few further changes in kernel 4.16 [2] and 4.18 [3, 4] further reduced the timeout to less than 1ms. With experiments, the problematic timeout in the latest kernel is the one for `wait_for_tpm_stat`. To fix it, the patch reverts the timeout of `wait_for_tpm_stat` to 15ms for all Atmel TPM 1.2 chips, but leaves it untouched for Atmel TPM 2.0 chips and chips from other vendors. As explained above, the chosen 15ms timeout is the actual timeout before this issue was introduced, thus the old value is used here. Particularly, TPM_ATML_TIMEOUT_WAIT_STAT_MIN is set to 14700us, TPM_ATML_TIMEOUT_WAIT_STAT_MAX is set to 15000us according to the existing TPM_TIMEOUT_RANGE_US (300us). The fix has been tested in a system with the affected Atmel chip, with no issues observed after boot up. References: [1] 9f3fc7bcddcb tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers [2] cf151a9a44d5 tpm: reduce tpm polling delay in tpm_tis_core [3] 59f5a6b07f64 tpm: reduce poll sleep time in tpm_transmit() [4] 424eaf910c32 tpm: reduce polling time to usecs for even finer granularity Fixes: 9f3fc7bcddcb ("tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers") Link: https://patchwork.kernel.org/project/linux-integrity/patch/20200926223150.109645-1-hao.wu@rubrik.com/ Signed-off-by: Hao Wu Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index aa11fe323c56..12d827734686 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -269,6 +269,7 @@ enum tpm2_cc_attrs { #define TPM_VID_INTEL 0x8086 #define TPM_VID_WINBOND 0x1050 #define TPM_VID_STM 0x104A +#define TPM_VID_ATML 0x1114 enum tpm_chip_flags { TPM_CHIP_FLAG_TPM2 = BIT(1), -- cgit v1.2.3 From 1bdda24c4af64cd2d65dec5192ab624c5fee7ca0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Oct 2021 15:55:05 -0700 Subject: signal: Add an optional check for altstack size New x86 FPU features will be very large, requiring ~10k of stack in signal handlers. These new features require a new approach called "dynamic features". The kernel currently tries to ensure that altstacks are reasonably sized. Right now, on x86, sys_sigaltstack() requires a size of >=2k. However, that 2k is a constant. Simply raising that 2k requirement to >10k for the new features would break existing apps which have a compiled-in size of 2k. Instead of universally enforcing a larger stack, prohibit a process from using dynamic features without properly-sized altstacks. This must be enforced in two places: * A dynamic feature can not be enabled without a large-enough altstack for each process thread.
* Once a dynamic feature is enabled, any request to install a too-small altstack will be rejected. The dynamic feature enabling code must examine each thread in a process to ensure that the altstacks are large enough. Add a new lock (sigaltstack_lock()) to ensure that threads can not race and change their altstack after being examined. Add the infrastructure in the form of a config option and provide empty stubs for architectures which do not need dynamic altstack size checks. This implementation will be fleshed out for x86 in a future patch called x86/arch_prctl: Add controls for dynamic XSTATE components [dhansen: commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20211021225527.10184-2-chang.seok.bae@intel.com --- include/linux/signal.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3f96a6374e4f..7d34105e20c6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -464,6 +464,12 @@ int __save_altstack(stack_t __user *, unsigned long); unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); +#ifdef CONFIG_DYNAMIC_SIGFRAME +bool sigaltstack_size_valid(size_t ss_size); +#else +static inline bool sigaltstack_size_valid(size_t size) { return true; } +#endif /* !CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); -- cgit v1.2.3 From 0953fb263714e1c8c1c3d395036d9a14310081dd Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 20 Oct 2021 20:23:09 +0100 Subject: irq: remove handle_domain_{irq,nmi}() Now that entry code handles IRQ entry (including setting the IRQ regs) before calling irqchip code, irqchip code can safely call generic_handle_domain_irq(), and there's no functional reason for it to call handle_domain_irq(). Let's cement this split of responsibility and remove handle_domain_irq() entirely, updating irqchip drivers to call generic_handle_domain_irq(). For consistency, handle_domain_nmi() is similarly removed and replaced with a generic_handle_domain_nmi() function which also does not perform any entry logic. Previously handle_domain_{irq,nmi}() had a WARN_ON() which would fire when they were called in an inappropriate context. So that we can identify similar issues going forward, similar WARN_ON_ONCE() logic is added to the generic_handle_*() functions, and comments are updated for clarity and consistency. Signed-off-by: Mark Rutland Reviewed-by: Marc Zyngier Cc: Thomas Gleixner --- include/linux/irqdesc.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 59aea39785bf..93d270ca0c56 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -168,14 +168,7 @@ int generic_handle_irq(unsigned int irq); * conversion failed.
*/ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); - -#ifdef CONFIG_HANDLE_DOMAIN_IRQ -int handle_domain_irq(struct irq_domain *domain, - unsigned int hwirq, struct pt_regs *regs); - -int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, - struct pt_regs *regs); -#endif +int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif /* Test to see if a driver has successfully requested an irq */ -- cgit v1.2.3 From 8d15a7295d33538954df2bca6a4011e4311a9cc2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 21 Oct 2021 18:04:14 +0100 Subject: genirq: Hide irq_cpu_{on,off}line() behind a deprecated option irq_cpu_{on,off}line() are now only used by the Octeon platform. Make their use conditional on this platform being enabled, and otherwise hidden away. Signed-off-by: Marc Zyngier Reviewed-by: Florian Fainelli Tested-by: Serge Semin Link: https://lore.kernel.org/r/20211021170414.3341522-4-maz@kernel.org --- include/linux/irq.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c8293c817646..0c746d325206 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -524,9 +524,10 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); +#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); - +#endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); @@ -606,8 +607,10 @@ struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); +#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); +#endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); -- cgit v1.2.3 From 99ce45d5e7dbde399997a630f45ac9f654fa4bcc Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 26 Oct 2021 09:57:28 +0800 Subject: mctp: Implement extended addressing This change allows an extended address struct - struct sockaddr_mctp_ext - to be passed to sendmsg/recvmsg. This allows userspace to specify output ifindex and physical address information (for sendmsg) or receive the input ifindex/physaddr for incoming messages (for recvmsg). This is typically used by userspace for MCTP address discovery and assignment operations. The extended addressing facility is conditional on a new sockopt: MCTP_OPT_ADDR_EXT; userspace must explicitly enable addressing before the kernel will consume/populate the extended address data. Includes a fix for an uninitialised var: Reported-by: kernel test robot Signed-off-by: Jeremy Kerr Signed-off-by: David S.
Miller --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 7612d760b6a9..8ef26d89ef49 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -365,6 +365,7 @@ struct ucred { #define SOL_TLS 282 #define SOL_XDP 283 #define SOL_MPTCP 284 +#define SOL_MCTP 285 /* IPX options */ #define IPX_TYPE 1 -- cgit v1.2.3 From 8e20f591f204f8db7f1182918f8e2285d3f589e0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 26 Oct 2021 11:06:01 +0100 Subject: net: phy: add phy_interface_t bitmap support Add support for a bitmap for phy interface modes, which includes: - a macro to declare the interface bitmap - an inline helper to zero the interface bitmap - an inline helper to detect an empty interface bitmap - inline helpers to do bitwise AND and OR operations on two interface bitmaps Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phy.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 04e90423fa88..96e43fbb2dd8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -155,6 +155,40 @@ typedef enum { PHY_INTERFACE_MODE_MAX, } phy_interface_t; +/* PHY interface mode bitmap handling */ +#define DECLARE_PHY_INTERFACE_MASK(name) \ + DECLARE_BITMAP(name, PHY_INTERFACE_MODE_MAX) + +static inline void phy_interface_zero(unsigned long *intf) +{ + bitmap_zero(intf, PHY_INTERFACE_MODE_MAX); +} + +static inline bool phy_interface_empty(const unsigned long *intf) +{ + return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); +} + +static inline void phy_interface_and(unsigned long *dst, const unsigned long *a, + const unsigned long *b) +{ + bitmap_and(dst, a, b, PHY_INTERFACE_MODE_MAX); +} + +static inline void phy_interface_or(unsigned long *dst, const unsigned long *a, + const unsigned long *b) +{ + bitmap_or(dst, a, b, PHY_INTERFACE_MODE_MAX); +} + +static inline void phy_interface_set_rgmii(unsigned long *intf) +{ + __set_bit(PHY_INTERFACE_MODE_RGMII, intf); + __set_bit(PHY_INTERFACE_MODE_RGMII_ID, intf); + __set_bit(PHY_INTERFACE_MODE_RGMII_RXID, intf); + __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); +} + /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ -- cgit v1.2.3 From 38c310eb46f5f80213a92093af11af270c209a76 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 26 Oct 2021 11:06:06 +0100 Subject: net: phylink: add MAC phy_interface_t bitmap Add a phy_interface_t bitmap so the MAC driver can specify which PHY interface modes it supports. Signed-off-by: Russell King Signed-off-by: David S.
Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index f7b5ed06a815..bc4b866cd99b 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -76,6 +76,7 @@ struct phylink_config { bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); + DECLARE_PHY_INTERFACE_MASK(supported_interfaces); }; /** -- cgit v1.2.3 From d25f3a74f30aace819163dfa54f2a4b8ca1dc932 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 26 Oct 2021 11:06:11 +0100 Subject: net: phylink: use supported_interfaces for phylink validation If the network device supplies a supported interface bitmap, we can use that during phylink's validation to simplify MAC drivers in two ways by using the supported_interfaces bitmap to: 1. reject unsupported interfaces before calling into the MAC driver. 2. generate the set of all supported link modes across all supported interfaces (used mainly for SFP, but also some 10G PHYs.) Suggested-by: Sean Anderson Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index bc4b866cd99b..f037470b6fb3 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -67,6 +67,8 @@ enum phylink_op_type { * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. + * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx + * are supported by the MAC/PCS. */ struct phylink_config { struct device *dev; @@ -134,8 +136,14 @@ struct phylink_mac_ops { * based on @state->advertising and/or @state->speed and update * @state->interface accordingly. See phylink_helper_basex_speed(). * - * When @state->interface is %PHY_INTERFACE_MODE_NA, phylink expects the - * MAC driver to return all supported link modes. + * When @config->supported_interfaces has been set, phylink will iterate + * over the supported interfaces to determine the full capability of the + * MAC. The validation function must not print errors if @state->interface + * is set to an unexpected value. + * + * When @config->supported_interfaces is empty, phylink will call this + * function with @state->interface set to %PHY_INTERFACE_MODE_NA, and + * expects the MAC driver to return all supported link modes. * * If the @state->interface mode is not supported, then the @supported * mask must be cleared. -- cgit v1.2.3 From e60feb445fce9e51c1558a6aa7faf9dd5ded533b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Oct 2021 13:11:00 -0400 Subject: fs: export an inode_update_time helper If you already have an inode and need to update the time on the inode there is no way to do this properly. Export this helper to allow file systems to update time on the inode so the appropriate handler is called, either ->update_time or generic_update_time. 
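A sketch of a caller, assuming an inode the filesystem already holds (the wrapper function below is illustrative; inode_update_time(), current_time() and the S_MTIME/S_CTIME flags are the existing interfaces):

```
#include <linux/fs.h>

/* Bump mtime/ctime on an inode we already have, dispatching through
 * the filesystem's ->update_time() when one is provided. */
static int example_touch_mtime(struct inode *inode)
{
	struct timespec64 now = current_time(inode);

	return inode_update_time(inode, &now, S_MTIME | S_CTIME);
}
```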
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..56eba723477e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2498,6 +2498,8 @@ enum file_time_flags { extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); +int inode_update_time(struct inode *inode, struct timespec64 *time, int flags); + static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) -- cgit v1.2.3 From 6b3671746a8a3aa05316b829e1357060f35009c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 Oct 2021 08:29:39 -0700 Subject: net/mlx5: remove the recent devlink params revert commit 46ae40b94d88 ("net/mlx5: Let user configure io_eq_size param") revert commit a6cb08daa3b4 ("net/mlx5: Let user configure event_eq_size param") revert commit 554604061979 ("net/mlx5: Let user configure max_macs param") The EQE parameters are applicable to more drivers, they should be configured via standard API, probably ethtool. Example of another driver needing something similar: https://lore.kernel.org/all/1633454136-14679-3-git-send-email-sbhatta@marvell.com/ The last param for "max_macs" is probably fine but the documentation is severely lacking. The meaning and implications for changing the param need to be stated. Link: https://lore.kernel.org/r/20211026152939.3125950-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++++ include/linux/mlx5/eq.h | 1 + include/linux/mlx5/mlx5_ifc.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 47c07f95bbe1..f617dfbcd9fd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -797,6 +797,10 @@ struct mlx5_db { int index; }; +enum { + MLX5_COMP_EQ_SIZE = 1024, +}; + enum { MLX5_PTYS_IB = 1 << 0, MLX5_PTYS_EN = 1 << 2, diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index 11161e427608..ea3ff5a8ced3 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -5,6 +5,7 @@ #define MLX5_CORE_EQ_H #define MLX5_NUM_CMD_EQE (32) +#define MLX5_NUM_ASYNC_EQE (0x1000) #define MLX5_NUM_SPARE_EQE (0x80) struct mlx5_eq; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 97465d00de9d..746381eccccf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1603,7 +1603,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ext_stride_num_range[0x1]; u8 roce_rw_supported[0x1]; - u8 log_max_current_uc_list_wr_supported[0x1]; + u8 reserved_at_3a2[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3 From fb4e0a5e73d4bb5ab69b7905abd2ec3b580e9b59 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 8 Oct 2021 13:33:04 -0700 Subject: skmsg: Extract and reuse sk_msg_is_readable() tcp_bpf_sock_is_readable() is pretty much generic, we can extract it and reuse it for non-TCP sockets. 
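A sketch of the intended reuse (the poll-side wrapper below is illustrative; only sk_msg_is_readable() itself comes from this patch):

```
#include <linux/skmsg.h>
#include <net/sock.h>

/* A protocol's poll path can combine its usual receive-queue check
 * with the generic check for data parked in the psock ingress queue. */
static bool example_sock_is_readable(struct sock *sk)
{
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	return sk_msg_is_readable(sk);
}
```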
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211008203306.37525-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14ab0c0bc924..1ce9a9eb223b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -128,6 +128,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { -- cgit v1.2.3 From 99d0a3831e3500d945162cdb2310e3a5fce90b60 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Oct 2021 08:46:10 -1000 Subject: bpf: Move BPF_MAP_TYPE for INODE_STORAGE and TASK_STORAGE outside of CONFIG_NET bpf_types.h has BPF_MAP_TYPE_INODE_STORAGE and BPF_MAP_TYPE_TASK_STORAGE declared inside #ifdef CONFIG_NET although they are built regardless of CONFIG_NET. So, when CONFIG_BPF_SYSCALL && !CONFIG_NET, they are built without the declarations leading to spurious build failures and not registered to bpf_map_types making them unavailable. Fix it by moving the BPF_MAP_TYPE for the two map types outside of CONFIG_NET. Reported-by: kernel test robot Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs") Signed-off-by: Tejun Heo Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/YXG1cuuSJDqHQfRY@slm.duckdns.org --- include/linux/bpf_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9c81724e4b98..bbe1eefa4c8a 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -101,14 +101,14 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) -#ifdef CONFIG_NET -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) +#ifdef CONFIG_NET +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) -- cgit v1.2.3 From 54713c85f536048e685258f880bf298a74c3620d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 26 Oct 2021 13:00:19 +0200 Subject: bpf: Fix potential race in tail call compatibility check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lorenzo noticed that the code testing for program type compatibility of tail call maps is potentially racy in that two threads could encounter a map with an unset type simultaneously and both return true even though they are inserting incompatible programs. 
The race window is quite small, but artificially enlarging it by adding a usleep_range() inside the check in bpf_prog_array_compatible() makes it trivial to trigger from userspace with a program that does, essentially: map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, 4, 4, 2, 0); pid = fork(); if (pid) { key = 0; value = xdp_fd; } else { key = 1; value = tc_fd; } err = bpf_map_update_elem(map_fd, &key, &value, 0); While the race window is small, it has potentially serious ramifications in that triggering it would allow a BPF program to tail call to a program of a different type. So let's get rid of it by protecting the update with a spinlock. The commit in the Fixes tag is the last commit that touches the code in question. v2: - Use a spinlock instead of an atomic variable and cmpxchg() (Alexei) v3: - Put lock and the members it protects into an embedded 'owner' struct (Daniel) Fixes: 3324b584b6f6 ("ebpf: misc core cleanup") Reported-by: Lorenzo Bianconi Signed-off-by: Toke HĂžiland-JĂžrgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211026110019.363464-1-toke@redhat.com --- include/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 020a7d5bf470..3db6f6c95489 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -929,8 +929,11 @@ struct bpf_array_aux { * stored in the map to make sure that all callers and callees have * the same prog type and JITed flag. */ - enum bpf_prog_type type; - bool jited; + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; -- cgit v1.2.3 From cfe6807d82e97e81c3209dca9448f091e1448a57 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Oct 2021 18:58:11 +0100 Subject: gpio: Allow per-parent interrupt data The core gpiolib code is able to deal with multiple interrupt parents for a single gpio irqchip. It however only allows a single piece of data to be conveyed to all flow handlers (either the gpio_chip or some other, driver-specific data). This means that drivers have to go through some interesting dance to find the correct context, something that isn't great in interrupt context (see aebdc8abc9db86e2bd33070fc2f961012fff74b4 for a prime example). Instead, offer an optional way for a pinctrl/gpio driver to provide an array of pointers which gets used to provide the correct context to the flow handler. Signed-off-by: Marc Zyngier Signed-off-by: Joey Gouly Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20211026175815.52703-2-joey.gouly@arm.com Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a0f9901dcae6..a673a359e20b 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -168,11 +168,18 @@ struct gpio_irq_chip { /** * @parent_handler_data: + * @parent_handler_data_array: * * Data associated, and passed to, the handler for the parent - * interrupt. + * interrupt. Can either be a single pointer if @per_parent_data + * is false, or an array of @num_parents pointers otherwise. If + * @per_parent_data is true, @parent_handler_data_array cannot be + * NULL. 
*/ - void *parent_handler_data; + union { + void *parent_handler_data; + void **parent_handler_data_array; + }; /** * @num_parents: @@ -203,6 +210,14 @@ struct gpio_irq_chip { */ bool threaded; + /** + * @per_parent_data: + * + * True if parent_handler_data_array describes a @num_parents + * sized array to be used as parent data. + */ + bool per_parent_data; + /** * @init_hw: optional routine to initialize hardware before * an IRQ chip will be added. This is quite useful when -- cgit v1.2.3 From 7529cc7fbd9c02eda6851f3260416cbe198a321d Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 30 Dec 2020 11:41:52 +0200 Subject: lib: bitmap: Introduce node-aware alloc API Expose new node-aware API for bitmap allocation: bitmap_alloc_node() / bitmap_zalloc_node(). Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/bitmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 37f36dad18bd..a241dcf50f39 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -123,6 +123,8 @@ struct device; */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); +unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node); +unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node); void bitmap_free(const unsigned long *bitmap); /* Managed variants of the above. */ -- cgit v1.2.3 From 50f477fe9933193e960785f1192be801d7cd307a Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Thu, 2 Jul 2020 17:22:45 +0300 Subject: net/mlx5e: Rename lro_timeout to packet_merge_timeout TIR stands for transport interface receive, the TIR object is responsible for performing all transport related operations on the receive side like packet processing, demultiplexing the packets to different RQ's, etc. lro_timeout is a field in the TIR that is used to set the timeout for lro session, this series introduces new packet merge type, therefore rename lro_timeout to packet_merge_timeout for all packet merge types. Signed-off-by: Ben Ben-Ishay Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 746381eccccf..21c0fd478afa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3361,8 +3361,8 @@ enum { }; enum { - MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO = 0x1, - MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO = 0x2, + MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO = BIT(0), + MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO = BIT(1), }; enum { @@ -3387,7 +3387,7 @@ struct mlx5_ifc_tirc_bits { u8 reserved_at_80[0x4]; u8 lro_timeout_period_usecs[0x10]; - u8 lro_enable_mask[0x4]; + u8 packet_merge_mask[0x4]; u8 lro_max_ip_payload_size[0x8]; u8 reserved_at_a0[0x40]; -- cgit v1.2.3 From 7025329d208cae45937d2a0910786a45b9981475 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 9 Sep 2020 17:36:39 +0300 Subject: net/mlx5: Add SHAMPO caps, HW bits and enumerations This commit adds SHAMPO bit to hca_cap and SHAMPO capabilities structure, SHAMPO related HW spec hardware fields and enumerations. SHAMPO stands for: split headers and merge payload offload. SHAMPO new fields: WQ: - headers_mkey: mkey that represents the headers buffer, where the packets headers will be written by the HW. 
- shampo_enable: flag to verify if the WQ supports SHAMPO feature. - log_reservation_size: the log of the reservation size where the data of the packet will be written by the HW. - log_max_num_of_packets_per_reservation: log of the maximum number of packets that can be written to the same reservation. - log_headers_entry_size: log of the header entry size of the headers buffer. - log_headers_buffer_entry_num: log of the entries number of the headers buffer. RQ: - shampo_no_match_alignment_granularity: the HW alignment granularity in case the received packet doesn't match the current session. - shampo_match_criteria_type: the type of match criteria. - reservation_timeout: the maximum time that the HW will hold the reservation. mlx5_ifc_shampo_cap_bits, the capabilities of the SHAMPO feature: - shampo_log_max_reservation_size: the maximum allowed value of the field WQ.log_reservation_size. - log_reservation_size: the minimum allowed value of the field WQ.log_reservation_size. - shampo_min_mss_size: the minimum payload size of packet that can open a new session or be merged to a session. - shampo_max_log_headers_entry_size: the maximum allowed value of the field WQ.log_headers_entry_size Signed-off-by: Ben Ben-Ishay Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 56 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f8a0bbb42c3b..0d30a6184e1d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1186,6 +1186,7 @@ enum mlx5_cap_type { MLX5_CAP_VDPA_EMULATION = 0x13, MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, + MLX5_CAP_DEV_SHAMPO = 0x1d, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, /* NUM OF CAP Types */ @@ -1431,6 +1432,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_IPSEC(mdev, cap)\ MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap) +#define MLX5_CAP_DEV_SHAMPO(mdev, cap)\ + MLX5_GET(shampo_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_SHAMPO], cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 21c0fd478afa..f1c134af5fcf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1350,7 +1350,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_b0[0x1]; u8 uplink_follow[0x1]; u8 ts_cqe_to_dest_cqn[0x1]; - u8 reserved_at_b3[0xd]; + u8 reserved_at_b3[0x7]; + u8 shampo[0x1]; + u8 reserved_at_bb[0x5]; u8 max_sgl_for_optimized_performance[0x8]; u8 log_max_cq_sz[0x8]; @@ -1893,7 +1895,21 @@ struct mlx5_ifc_wq_bits { u8 reserved_at_139[0x4]; u8 log_wqe_stride_size[0x3]; - u8 reserved_at_140[0x4c0]; + u8 reserved_at_140[0x80]; + + u8 headers_mkey[0x20]; + + u8 shampo_enable[0x1]; + u8 reserved_at_1e1[0x4]; + u8 log_reservation_size[0x3]; + u8 reserved_at_1e8[0x5]; + u8 log_max_num_of_packets_per_reservation[0x3]; + u8 reserved_at_1f0[0x6]; + u8 log_headers_entry_size[0x2]; + u8 reserved_at_1f8[0x4]; + u8 log_headers_buffer_entry_num[0x4]; + + u8 reserved_at_200[0x400]; struct mlx5_ifc_cmd_pas_bits pas[]; }; @@ -3169,6 +3185,20 @@ struct mlx5_ifc_roce_addr_layout_bits { u8 reserved_at_e0[0x20]; }; +struct mlx5_ifc_shampo_cap_bits { + u8 reserved_at_0[0x3]; + u8 shampo_log_max_reservation_size[0x5]; + u8 reserved_at_8[0x3]; + u8 shampo_log_min_reservation_size[0x5]; + u8 shampo_min_mss_size[0x10]; + + u8 
reserved_at_20[0x3]; + u8 shampo_max_log_headers_entry_size[0x5]; + u8 reserved_at_28[0x18]; + + u8 reserved_at_40[0x7c0]; +}; + union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2; @@ -3187,6 +3217,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_tls_cap_bits tls_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; + struct mlx5_ifc_shampo_cap_bits shampo_cap; u8 reserved_at_0[0x8000]; }; @@ -3363,6 +3394,7 @@ enum { enum { MLX5_TIRC_PACKET_MERGE_MASK_IPV4_LRO = BIT(0), MLX5_TIRC_PACKET_MERGE_MASK_IPV6_LRO = BIT(1), + MLX5_TIRC_PACKET_MERGE_MASK_SHAMPO = BIT(2), }; enum { @@ -3569,6 +3601,18 @@ enum { MLX5_RQC_STATE_ERR = 0x3, }; +enum { + MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_BYTE = 0x0, + MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_STRIDE = 0x1, + MLX5_RQC_SHAMPO_NO_MATCH_ALIGNMENT_GRANULARITY_PAGE = 0x2, +}; + +enum { + MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_NO_MATCH = 0x0, + MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_EXTENDED = 0x1, + MLX5_RQC_SHAMPO_MATCH_CRITERIA_TYPE_FIVE_TUPLE = 0x2, +}; + struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; u8 delay_drop_en[0x1]; @@ -3601,7 +3645,13 @@ struct mlx5_ifc_rqc_bits { u8 reserved_at_c0[0x10]; u8 hairpin_peer_vhca[0x10]; - u8 reserved_at_e0[0xa0]; + u8 reserved_at_e0[0x46]; + u8 shampo_no_match_alignment_granularity[0x2]; + u8 reserved_at_128[0x6]; + u8 shampo_match_criteria_type[0x2]; + u8 reservation_timeout[0x10]; + + u8 reserved_at_140[0x40]; struct mlx5_ifc_wq_bits wq; }; -- cgit v1.2.3 From eaee12f046924eeb1210c7e4f3b326603ff1bd85 Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Wed, 9 Jun 2021 12:27:32 +0300 Subject: net/mlx5e: Rename TIR lro functions to TIR packet merge functions This series introduces new packet merge type, therefore rename lro functions to packet merge to support the new merge type: - Generalize + rename mlx5e_build_tir_ctx_lro to mlx5e_build_tir_ctx_packet_merge. - Rename mlx5e_modify_tirs_lro to mlx5e_modify_tirs_packet_merge. - Rename lro bit in mlx5_ifc_modify_tir_bitmask_bits to packet_merge. - Rename lro_en in mlx5e_params to packet_merge_type type and combine packet_merge params into one struct mlx5e_packet_merge_param. Signed-off-by: Khalid Manaa Signed-off-by: Ben Ben-Ishay Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f1c134af5fcf..0bb78c04336c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6707,7 +6707,7 @@ struct mlx5_ifc_modify_tir_bitmask_bits { u8 reserved_at_3c[0x1]; u8 hash[0x1]; u8 reserved_at_3e[0x1]; - u8 lro[0x1]; + u8 packet_merge[0x1]; }; struct mlx5_ifc_modify_tir_out_bits { -- cgit v1.2.3 From d7b896acbdcb3ef5dab1fd2f33ba5a8da6ba1dda Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Tue, 14 Jul 2020 14:40:32 +0300 Subject: net/mlx5e: Add support to klm_umr_wqe This commit adds the needed definitions for using the klm_umr_wqe. UMR stands for user-mode memory registration, is a mechanism to alter address translation properties of MKEY by posting WorkQueueElement aka WQE on send queue. MKEY stands for memory key, MKEY are used to describe a region in memory that can be later used by HW. 
KLM stands for {Key, Length, MemVa}; KLM_MKEY is an indirect MKEY that enables mapping multiple memory spaces of different sizes into a unified MKEY. klm_umr_wqe is a UMR that is used to update a KLM_MKEY. The SHAMPO feature uses a KLM_MKEY for memory registration of its headers buffer. Signed-off-by: Ben Ben-Ishay Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0d30a6184e1d..c920e5932368 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,6 +290,7 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT -- cgit v1.2.3 From f97d5c2a453e26071e3b0ec12161de57c4a237c4 Mon Sep 17 00:00:00 2001 From: Khalid Manaa Date: Tue, 19 May 2020 15:45:38 +0300 Subject: net/mlx5e: Add handle SHAMPO cqe support This patch adds the new CQE SHAMPO fields: - flush: indicates that we must close the current session and pass the SKB to the network stack. - match: indicates that the current packet matches the opened session; the packet will be merged into the current SKB. - header_size: the size of the packet headers that are written into the headers buffer. - header_entry_index: the entry index in the headers buffer. - data_offset: the packet's data offset in the WQE. Also, a new cqe handler is added to handle SHAMPO packets: - The new handler uses CQE SHAMPO fields to build the SKB. The CQE's flush and match fields are not used in this patch; packets are not merged in this patch. Signed-off-by: Khalid Manaa Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index c920e5932368..56bcf95d4ab7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -800,10 +800,23 @@ struct mlx5_cqe64 { u8 tls_outer_l3_tunneled; u8 rsvd0; __be16 wqe_id; - u8 lro_tcppsh_abort_dupack; - u8 lro_min_ttl; - __be16 lro_tcp_win; - __be32 lro_ack_seq_num; + union { + struct { + u8 tcppsh_abort_dupack; + u8 min_ttl; + __be16 tcp_win; + __be32 ack_seq_num; + } lro; + struct { + u8 reserved0:1; + u8 match:1; + u8 flush:1; + u8 reserved3:5; + u8 header_size; + __be16 header_entry_index; + __be32 data_offset; + } shampo; + }; __be32 rss_hash_result; u8 rss_hash_type; u8 ml_path; @@ -873,7 +886,7 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { - return (cqe->lro_tcppsh_abort_dupack >> 6) & 1; + return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; } static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) -- cgit v1.2.3 From a2247f19ee1c5ad75ef095cdfb909a3244b88aa8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 27 Oct 2021 11:22:19 +0900 Subject: block: Add independent access ranges support The Concurrent Positioning Ranges VPD page (for SCSI) and data log page (for ATA) contain parameters describing the set of contiguous LBAs that can be served independently by a single LUN multi-actuator hard-disk. Similarly, a logically defined block device composed of multiple disks can in some cases execute requests directed at different sector ranges in parallel.
A dm-linear device aggregating 2 block devices together is an example. This patch implements support for exposing a block device's independent access ranges to the user through sysfs to allow optimizing device accesses to increase performance. To describe the set of independent sector ranges of a device (actuators of a multi-actuator HDD or table entries of a dm-linear device), the type struct blk_independent_access_ranges is introduced. This structure describes the sector ranges using an array of struct blk_independent_access_range structures. This range structure defines the start sector and number of sectors of the access range. The ranges in the array cannot overlap and must contain all sectors within the device capacity. The function disk_set_independent_access_ranges() allows a device driver to signal to the block layer that a device has multiple independent access ranges. In this case, a struct blk_independent_access_ranges is attached to the device request queue by the function disk_set_independent_access_ranges(). The function disk_alloc_independent_access_ranges() is provided for drivers to allocate this structure. struct blk_independent_access_ranges contains kobjects (struct kobject) to expose to the user through sysfs the set of independent access ranges supported by a device. When the device is initialized, sysfs registration of the ranges information is done from blk_register_queue() using the block layer internal function disk_register_independent_access_ranges(). If a driver calls disk_set_independent_access_ranges() for a registered queue, e.g. when a device is revalidated, disk_set_independent_access_ranges() will execute disk_register_independent_access_ranges() to update the sysfs attribute files. The sysfs file structure created starts from the independent_access_ranges sub-directory and contains the start sector and number of sectors of each range, with the information for each range grouped in numbered sub-directories. E.g. for a dual actuator HDD, the user sees: $ tree /sys/block/sdk/queue/independent_access_ranges/ /sys/block/sdk/queue/independent_access_ranges/ |-- 0 | |-- nr_sectors | `-- sector `-- 1 |-- nr_sectors `-- sector For a regular device with a single access range, the independent_access_ranges sysfs directory does not exist. Device revalidation may lead to changes to this structure and to the attribute values. When manipulated, the queue sysfs_lock and sysfs_dir_lock mutexes are held for atomicity, similarly to how the blk-mq and elevator sysfs queue sub-directories are protected. The code related to the management of independent access ranges is added in the new file block/blk-ia-ranges.c. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Martin K.
Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20211027022223.183838-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72ccb2829db..6d95a4b36cfa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -150,6 +150,34 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ +/* + * Independent access ranges: struct blk_independent_access_range describes + * a range of contiguous sectors that can be accessed using device command + * execution resources that are independent from the resources used for + * other access ranges. This is typically found with single-LUN multi-actuator + * HDDs where each access range is served by a different set of heads. + * The set of independent ranges supported by the device is defined using + * struct blk_independent_access_ranges. The independent ranges must not overlap + * and must include all sectors within the disk capacity (no sector holes + * allowed). + * For a device with multiple ranges, requests targeting sectors in different + * ranges can be executed in parallel. A request can straddle an access range + * boundary. + */ +struct blk_independent_access_range { + struct kobject kobj; + struct request_queue *queue; + sector_t sector; + sector_t nr_sectors; +}; + +struct blk_independent_access_ranges { + struct kobject kobj; + bool sysfs_registered; + unsigned int nr_ia_ranges; + struct blk_independent_access_range ia_range[]; +}; + struct request_queue { struct request *last_merge; struct elevator_queue *elevator; @@ -331,6 +359,12 @@ struct request_queue { #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; + + /* + * Independent sector access ranges. This is always NULL for + * devices that do not have multiple independent access ranges. + */ + struct blk_independent_access_ranges *ia_ranges; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ @@ -698,6 +732,11 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars); + /* * Elevator features for blk_queue_required_elevator_features: */ -- cgit v1.2.3 From fe22e1c2f705676a705d821301fc52eecc2fe055 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 27 Oct 2021 11:22:21 +0900 Subject: libata: support concurrent positioning ranges log Add support to discover if an ATA device supports the Concurrent Positioning Ranges data log (address 0x47), indicating that the device is capable of seeking to multiple different locations in parallel using multiple actuators serving different LBA ranges. Also add support to translate the concurrent positioning ranges log into its equivalent Concurrent Positioning Ranges VPD page B9h in libata-scsi.c. The format of the Concurrent Positioning Ranges Log is defined in ACS-5 r9. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20211027022223.183838-4-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/ata.h | 1 + include/linux/libata.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 1b44f40c7700..199e47e97d64 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -329,6 +329,7 @@ enum { ATA_LOG_SECURITY = 0x06, ATA_LOG_SATA_SETTINGS = 0x08, ATA_LOG_ZONED_INFORMATION = 0x09, + ATA_LOG_CONCURRENT_POSITIONING_RANGES = 0x47, /* Identify device SATA settings log:*/ ATA_LOG_DEVSLP_OFFSET = 0x30, diff --git a/include/linux/libata.h b/include/linux/libata.h index c0c64f03e107..236ec689056a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -676,6 +676,18 @@ struct ata_ering { struct ata_ering_entry ring[ATA_ERING_SIZE]; }; +struct ata_cpr { + u8 num; + u8 num_storage_elements; + u64 start_lba; + u64 num_lbas; +}; + +struct ata_cpr_log { + u8 nr_cpr; + struct ata_cpr cpr[]; +}; + struct ata_device { struct ata_link *link; unsigned int devno; /* 0 or 1 */ @@ -735,6 +747,9 @@ struct ata_device { u32 zac_zones_optimal_nonseq; u32 zac_zones_max_open; + /* Concurrent positioning ranges */ + struct ata_cpr_log *cpr_log; + /* error history */ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ -- cgit v1.2.3 From 785d584c30ffc1224027536fe55bdc15ee509f14 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Oct 2021 17:21:37 +0200 Subject: nvme: add new discovery log page entry definitions TP8014 adds a new SUBTYPE value and a new field EFLAGS for the discovery log page entry. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 357482dedb59..855dd9b3e84b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -27,8 +27,14 @@ #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { - NVME_NQN_DISC = 1, /* Discovery type target subsystem */ - NVME_NQN_NVME = 2, /* NVME type target subsystem */ + /* Referral to another discovery type target subsystem */ + NVME_NQN_DISC = 1, + + /* NVME type target subsystem */ + NVME_NQN_NVME = 2, + + /* Current discovery type target subsystem */ + NVME_NQN_CURR = 3, }; enum nvme_ctrl_type { @@ -1312,6 +1318,12 @@ struct nvmf_common_command { #define MAX_DISC_LOGS 255 +/* Discovery log page entry flags (EFLAGS): */ +enum { + NVME_DISC_EFLAGS_EPCSD = (1 << 1), + NVME_DISC_EFLAGS_DUPRETINFO = (1 << 0), +}; + /* Discovery log page entry */ struct nvmf_disc_rsp_page_entry { __u8 trtype; @@ -1321,7 +1333,8 @@ struct nvmf_disc_rsp_page_entry { __le16 portid; __le16 cntlid; __le16 asqsz; - __u8 resv8[22]; + __le16 eflags; + __u8 resv10[20]; char trsvcid[NVMF_TRSVCID_SIZE]; __u8 resv64[192]; char subnqn[NVMF_NQN_FIELD_LEN]; -- cgit v1.2.3 From 33f98a9798f55fd77c36ce79d8cfa5329e55a789 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 Oct 2021 10:57:41 -0700 Subject: x86/boot/compressed: Avoid duplicate malloc() implementations The early malloc() and free() implementation in include/linux/decompress/mm.h (which is also included by the static decompressors) is static. 
This is fine when the only thing interested in using malloc() is the decompression code, but the x86 early boot environment may use malloc() in a couple of places, leading to a potential collision when the static copies of the available memory region ("malloc_ptr") get reset to the global "free_mem_ptr" value. As it happened, the existing usage pattern was accidentally safe because each user did 1 malloc() and 1 free() before returning, and the calls were not nested:

extract_kernel()  (misc.c)
	choose_random_location()  (kaslr.c)
		mem_avoid_init()
			handle_mem_options()
				malloc()
				...
				free()
	...
	parse_elf()  (misc.c)
		malloc()
		...
		free()

Once the future FGKASLR series is added, however, it will insert additional malloc() calls local to fgkaslr.c in the middle of parse_elf()'s malloc()/free() pair:

parse_elf()  (misc.c)
	malloc()
	if (...) {
		layout_randomized_image(output, &ehdr, phdrs);
			malloc() <- boom
			...
	else
		layout_image(output, &ehdr, phdrs);
	free()

To avoid collisions, there must be a single implementation of malloc(). Adjust include/linux/decompress/mm.h so that visibility can be controlled, provide prototypes in misc.h, and implement the functions in misc.c. This also results in a small size savings:

$ size vmlinux.before vmlinux.after
   text    data     bss     dec     hex filename
8842314     468  178320 9021102  89a6ae vmlinux.before
8842240     468  178320 9021028  89a664 vmlinux.after

Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211013175742.1197608-4-keescook@chromium.org --- include/linux/decompress/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/mm.h b/include/linux/decompress/mm.h index 868e9eacd69e..9192986b1a73 100644 --- a/include/linux/decompress/mm.h +++ b/include/linux/decompress/mm.h @@ -25,13 +25,21 @@ #define STATIC_RW_DATA static #endif +/* + * When an architecture needs to share the malloc()/free() implementation + * between compilation units, it needs to have non-local visibility. + */ +#ifndef MALLOC_VISIBLE +#define MALLOC_VISIBLE static +#endif + /* A trivial malloc implementation, adapted from * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 */ STATIC_RW_DATA unsigned long malloc_ptr; STATIC_RW_DATA int malloc_count; -static void *malloc(int size) +MALLOC_VISIBLE void *malloc(int size) { void *p; @@ -52,7 +60,7 @@ static void *malloc(int size) return p; } -static void free(void *where) +MALLOC_VISIBLE void free(void *where) { malloc_count--; if (!malloc_count) -- cgit v1.2.3 From 9baf93d68bcc3d0a6042283b82603c076e25e4f5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:16 -0300 Subject: fsnotify: pass data_type to fsnotify_name() Align the arguments of fsnotify_name() to those of fsnotify(). Link: https://lore.kernel.org/r/20211025192746.66445-2-krisman@collabora.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 12d3a7d308ab..d1144d7c3536 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -26,20 +26,21 @@ * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only * the child is interested and not the parent.
*/ -static inline void fsnotify_name(struct inode *dir, __u32 mask, - struct inode *child, - const struct qstr *name, u32 cookie) +static inline int fsnotify_name(__u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) { if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0) - return; + return 0; - fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie); + return fsnotify(mask, data, data_type, dir, name, NULL, cookie); } static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { - fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0); + fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE, + dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) @@ -154,8 +155,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie); - fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie); + fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE, + old_dir, old_name, fs_cookie); + fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE, + new_dir, new_name, fs_cookie); if (target) fsnotify_link_count(target); @@ -209,7 +212,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, fsnotify_link_count(inode); audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0); + fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE, + dir, &new_dentry->d_name, 0); } /* -- cgit v1.2.3 From fd5a3ff49a19aa69e2bc1e26e98037c2d778e61a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:17 -0300 Subject: fsnotify: pass dentry instead of inode data Define a new data type to pass for events - FSNOTIFY_EVENT_DENTRY. Use it to pass the dentry instead of its ->d_inode where available. This is needed in preparation for the refactoring to retrieve the super block from the data field. In some cases (e.g. mkdir in kernfs), the data inode comes from a negative dentry, such that no super block information would be available. By receiving the dentry itself, instead of the inode, fsnotify can derive the super block even in these cases.
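As an illustrative sketch (not part of this patch), a backend consuming the event payload can then recover both objects, using the fsnotify_data_dentry() helper added below alongside the existing fsnotify_data_inode():

	/* illustrative only: handle an FSNOTIFY_EVENT_DENTRY payload */
	struct dentry *dentry = fsnotify_data_dentry(data, data_type);
	struct inode *inode = fsnotify_data_inode(data, data_type);

	if (dentry && !inode)
		pr_debug("event on negative dentry %pd\n", dentry);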
Link: https://lore.kernel.org/r/20211025192746.66445-3-krisman@collabora.com Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein [Expand explanation in commit message] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 5 ++--- include/linux/fsnotify_backend.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d1144d7c3536..df0fa4687a18 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -39,8 +39,7 @@ static inline int fsnotify_name(__u32 mask, const void *data, int data_type, static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry, __u32 mask) { - fsnotify_name(mask, d_inode(dentry), FSNOTIFY_EVENT_INODE, - dir, &dentry->d_name, 0); + fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0); } static inline void fsnotify_inode(struct inode *inode, __u32 mask) @@ -87,7 +86,7 @@ notify_child: */ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) { - fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE); + fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } static inline int fsnotify_file(struct file *file, __u32 mask) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 1ce66748a2d2..a2db821e8a8f 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -248,6 +248,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, + FSNOTIFY_EVENT_DENTRY, }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) @@ -255,6 +256,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; + case FSNOTIFY_EVENT_DENTRY: + return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); default: return NULL; } } +static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_DENTRY: + /* Non const is needed for dget() */ + return (struct dentry *)data; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry; + default: + return NULL; + } +} + static inline const struct path *fsnotify_data_path(const void *data, int data_type) { -- cgit v1.2.3 From dabe729dddca550446e9cc118c96d1f91703345b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 25 Oct 2021 16:27:18 -0300 Subject: fsnotify: clarify contract for create event hooks Clarify argument names and contract for fsnotify_create() and fsnotify_mkdir() to reflect the anomaly of kernfs, which leaves dentries negative after mkdir/create. Remove the WARN_ON(!inode) in audit code that was added by the Fixes commit under the wrong assumption that dentries cannot be negative after mkdir/create.
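A short sketch of the clarified contract (illustrative, not a hunk from the patch): the hooks take the parent directory inode plus the dentry, and consumers must not assume the dentry is positive:

	fsnotify_mkdir(dir, dentry);
	/* d_inode(dentry) may still be NULL here, e.g. on kernfs */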
Fixes: aa93bdc5500c ("fsnotify: use helpers to access data by data_type") Link: https://lore.kernel.org/linux-fsdevel/87mtp5yz0q.fsf@collabora.com/ Link: https://lore.kernel.org/r/20211025192746.66445-4-krisman@collabora.com Reviewed-by: Jan Kara Reported-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index df0fa4687a18..1e5f7435a4b5 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -192,16 +192,22 @@ static inline void fsnotify_inoderemove(struct inode *inode) /* * fsnotify_create - 'name' was linked in + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_create(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_create(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE); + fsnotify_dirent(dir, dentry, FS_CREATE); } /* * fsnotify_link - new hardlink in 'inode' directory + * + * Caller must make sure that new_dentry->d_name is stable. * Note: We have to pass also the linked inode ptr as some filesystems leave * new_dentry->d_inode NULL and instantiate inode pointer later */ @@ -230,12 +236,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry) /* * fsnotify_mkdir - directory 'name' was created + * + * Caller must make sure that dentry->d_name is stable. + * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate + * ->d_inode later */ -static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) +static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry) { - audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE); + audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE); - fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR); + fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR); } /* -- cgit v1.2.3 From 808967a0a4d2f4ce6a2005c5692fffbecaf018c1 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:23 -0300 Subject: fsnotify: Add helper to detect overflow_event Similarly to fanotify_is_perm_event() and friends, provide a helper predicate that tells whether a mask corresponds to an overflow event.
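A hypothetical caller cleanup this enables (sketch; not a hunk from this series):

-	if (event->mask & FS_Q_OVERFLOW)
+	if (fsnotify_is_overflow_event(event->mask))
 		return true;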
Link: https://lore.kernel.org/r/20211025192746.66445-9-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a2db821e8a8f..749bc85e1d1c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -510,6 +510,11 @@ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) fsnotify_add_event(group, group->overflow_event, NULL, NULL); } +static inline bool fsnotify_is_overflow_event(u32 mask) +{ + return mask & FS_Q_OVERFLOW; +} + static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); -- cgit v1.2.3 From 1ad03c3a326a86e259389592117252c851873395 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:24 -0300 Subject: fsnotify: Add wrapper around fsnotify_add_event fsnotify_add_event is growing in number of parameters, which in most cases are just passed a NULL pointer. So, split out a new fsnotify_insert_event function to clean things up for users who don't need an insert hook. Link: https://lore.kernel.org/r/20211025192746.66445-10-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 749bc85e1d1c..b323d0c4b967 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -498,16 +498,25 @@ extern int fsnotify_fasync(int fd, struct file *file, int on); extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ -extern int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)); +extern int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)); + +static inline int fsnotify_add_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *)) +{ + return fsnotify_insert_event(group, event, merge, NULL); +} + /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { - fsnotify_add_event(group, group->overflow_event, NULL, NULL); + fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) -- cgit v1.2.3 From 29335033c574a15334015d8c4e36862cff3d3384 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:25 -0300 Subject: fsnotify: Retrieve super block from the data field Some file system events (e.g. FS_ERROR) might not be associated with an inode or directory. For these, we can retrieve the super block from the data field.
But, since the super_block is available in the data field on every event type, simplify the code to always retrieve it from there, through a new helper. Link: https://lore.kernel.org/r/20211025192746.66445-11-krisman@collabora.com Suggested-by: Jan Kara Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b323d0c4b967..035438fe4a43 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -289,6 +289,21 @@ static inline const struct path *fsnotify_data_path(const void *data, int data_type) { } } +static inline struct super_block *fsnotify_data_sb(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_INODE: + return ((struct inode *)data)->i_sb; + case FSNOTIFY_EVENT_DENTRY: + return ((struct dentry *)data)->d_sb; + case FSNOTIFY_EVENT_PATH: + return ((const struct path *)data)->dentry->d_sb; + default: + return NULL; + } +} + enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, -- cgit v1.2.3 From 24dca90590509a7a6cbe0650100c90c5b8a3468a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:26 -0300 Subject: fsnotify: Protect fsnotify_handle_inode_event from no-inode events FAN_FS_ERROR allows events without inodes, i.e. for file system-wide errors. Even though fsnotify_handle_inode_event is not currently used by fanotify, this patch protects other backends from cases where neither inode nor dir is provided. Also document the constraints of the interface (inode and dir cannot both be NULL). Link: https://lore.kernel.org/r/20211025192746.66445-12-krisman@collabora.com Suggested-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 035438fe4a43..b71dc788018e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -136,6 +136,7 @@ struct mem_cgroup; * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. + * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * -- cgit v1.2.3 From 330ae77d2a5b0af32c0f29e139bf28ec8591de59 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:27 -0300 Subject: fsnotify: Pass group argument to free_event For group-wide mempool backed events, like FS_ERROR, the free_event callback will need to reference the group's mempool to free the memory. Wire that argument into the current callers.
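For example, a backend whose events come from a group-wide pool could then free them like this (illustrative sketch; the error_events_pool field is only introduced by a later patch in this series):

	static void example_free_event(struct fsnotify_group *group,
				       struct fsnotify_event *event)
	{
		/* the group pointer gives the callback access to group state */
		mempool_free(event, &group->fanotify_data.error_events_pool);
	}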
Link: https://lore.kernel.org/r/20211025192746.66445-13-krisman@collabora.com Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b71dc788018e..3a7c31436182 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -156,7 +156,7 @@ struct fsnotify_ops { const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); - void (*free_event)(struct fsnotify_event *event); + void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; -- cgit v1.2.3 From 4fe595cf1c80e7a5af4d00c4da29def64aff57a2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:31 -0300 Subject: fanotify: Require fid_mode for any non-fd event Like inode events, FAN_FS_ERROR will require fid mode. Therefore, convert the verification during fanotify_mark(2) to require fid for any non-fd event. This means fid_mode will not only be required for inode events, but for any event that doesn't provide a descriptor. Link: https://lore.kernel.org/r/20211025192746.66445-17-krisman@collabora.com Suggested-by: Amir Goldstein Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fanotify.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index eec3b7c40811..52d464802d99 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -84,6 +84,9 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ */ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) +/* Events that can be reported with event->fd */ +#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) + /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) -- cgit v1.2.3 From 9daa811073fa19c08e8aad3b90f9235fed161acf Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:32 -0300 Subject: fsnotify: Support FS_ERROR event type Expose a new type of fsnotify event for filesystems to report errors for userspace monitoring tools. fanotify will send this type of notification for FAN_FS_ERROR events. This also introduces a helper for generating the new event.
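As a usage sketch (assuming the fsnotify_sb_error() helper added below; the variable names are placeholders), a filesystem could report an error like this:

	/* illustrative: inode may be NULL for errors not tied to one inode */
	fsnotify_sb_error(sb, inode, err);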
Link: https://lore.kernel.org/r/20211025192746.66445-18-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify.h | 13 +++++++++++++ include/linux/fsnotify_backend.h | 32 +++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1e5f7435a4b5..787545e87eeb 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -339,4 +339,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid) fsnotify_dentry(dentry, mask); } +static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, + int error) +{ + struct fs_error_report report = { + .error = error, + .inode = inode, + .sb = sb, + }; + + return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, + NULL, NULL, NULL, 0); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3a7c31436182..00dbaafbcf95 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -42,6 +42,12 @@ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ +#define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ + +/* + * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify + * which does not support FS_ERROR. + */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ @@ -95,7 +101,8 @@ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ - FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED) + FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ + FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ @@ -250,6 +257,13 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_ERROR, +}; + +struct fs_error_report { + int error; + struct inode *inode; + struct super_block *sb; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) @@ -261,6 +275,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *)data)->inode; default: return NULL; } @@ -300,6 +316,20 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_ERROR: + return ((struct fs_error_report *) data)->sb; + default: + return NULL; + } +} + +static inline struct fs_error_report *fsnotify_data_error_report( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_ERROR: + return (struct fs_error_report *) data; default: return NULL; } -- cgit v1.2.3 From 734a1a5eccc5f7473002b0669f788e135f1f64aa Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:34 -0300 Subject: fanotify: Pre-allocate pool of error events Pre-allocate slots for file system errors to have greater chances of succeeding, since error events can happen in 
GFP_NOFS context. This patch introduces a group-wide mempool of error events, shared by all FAN_FS_ERROR marks in this group. Link: https://lore.kernel.org/r/20211025192746.66445-20-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 00dbaafbcf95..51ef2b079bfa 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -19,6 +19,7 @@ #include #include #include +#include <linux/mempool.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily @@ -246,6 +247,7 @@ struct fsnotify_group { int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; + mempool_t error_events_pool; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; -- cgit v1.2.3 From 9709bd548f11a092d124698118013f66e1740f9b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 25 Oct 2021 16:27:43 -0300 Subject: fanotify: Allow users to request FAN_FS_ERROR events Wire up the FAN_FS_ERROR event in the fanotify_mark syscall, allowing user space to request the monitoring of FAN_FS_ERROR events. These events are limited to filesystem marks, so check that this is the case in the syscall handler. Link: https://lore.kernel.org/r/20211025192746.66445-29-krisman@collabora.com Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jan Kara --- include/linux/fanotify.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 52d464802d99..616af2ea20f3 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -91,9 +91,13 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) +/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ +#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ - FANOTIFY_INODE_EVENTS) + FANOTIFY_INODE_EVENTS | \ + FANOTIFY_ERROR_EVENTS) /* Events that require a permission response from user */ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ -- cgit v1.2.3 From 1bb6b81029456f4e2e6727c5167f43bdfc34bee5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 27 Oct 2021 13:21:07 +0100 Subject: block: avoid extra iter advance with async iocb Nobody cares about the iov iterator's state if we return -EIOCBQUEUED, and since we now have __blkdev_direct_IO_async(), which gets pages only once, we can skip the expensive iov_iter_advance(). It's around 1-2% of all CPU time spent.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a6158edfbfa2ae3bc24aed29a72f035df18fad2f.1635337135.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index c88700d1bdc3..fe6bdfbbef66 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,6 +417,7 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -- cgit v1.2.3 From 5460601de590158b37619f8e18b678aa18da6345 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 12 Oct 2021 15:09:01 +0300 Subject: dma-buf: Fix pin callback comment The pin callback does not necessarily have to move the memory to system memory; remove the sentence from the comment. Link: https://lore.kernel.org/r/20211012120903.96933-2-galpress@amazon.com Signed-off-by: Gal Pressman Reviewed-by: Christian König Signed-off-by: Jason Gunthorpe --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 8b32b4bdd590..8afe182fe8f2 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -86,8 +86,8 @@ struct dma_buf_ops { * @pin: * * This is called by dma_buf_pin() and lets the exporter know that the - * DMA-buf can't be moved any more. The exporter should pin the buffer - * into system memory to make sure it is generally accessible by other + * DMA-buf can't be moved any more. Ideally, the exporter should + * pin the buffer so that it is generally accessible by all * devices. * * This is called with the &dmabuf.resv object locked and is mutual -- cgit v1.2.3 From 3ab7b6ac5d829e60c3b89d415811ff1c9f358c8e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 25 Oct 2021 10:09:23 -0700 Subject: pwm: Introduce single-PWM of_xlate function The existing pxa driver and the upcoming addition of PWM support in the TI sn565dsi86 DSI/eDP bridge driver both have a single PWM channel and thereby a need for an of_xlate function with the period as its single argument. Introduce a common helper function in the core that can be used as of_xlate by such drivers and migrate the pxa driver to use this.
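A single-channel driver can then opt in with something like this (illustrative sketch):

	chip->of_xlate = of_pwm_single_xlate;
	chip->of_pwm_n_cells = 1;

letting device tree consumers reference the PWM with a single cell holding the period, e.g. pwms = <&mypwm 5000000>; (the mypwm label is assumed for illustration).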
Signed-off-by: Bjorn Andersson Acked-by: Uwe Kleine-König Tested-by: Steev Klimaszewski Signed-off-by: Robert Foss Link: https://patchwork.freedesktop.org/patch/msgid/20211025170925.3096444-1-bjorn.andersson@linaro.org --- include/linux/pwm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 725c9b784e60..dd51d4931fdc 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -414,6 +414,8 @@ struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, struct pwm_device *of_pwm_xlate_with_flags(struct pwm_chip *pc, const struct of_phandle_args *args); +struct pwm_device *of_pwm_single_xlate(struct pwm_chip *pc, + const struct of_phandle_args *args); struct pwm_device *pwm_get(struct device *dev, const char *con_id); struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, -- cgit v1.2.3 From ce5e48036c9e76a2a5bd4d9079eac273087a533a Mon Sep 17 00:00:00 2001 From: 王贇 Date: Wed, 27 Oct 2021 11:14:44 +0800 Subject: ftrace: disable preemption when recursion locked As the documentation explains, ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() were supposed to disable and enable preemption properly; however, currently this work is done outside of the functions, which could be missed by mistake. And since the internal use of trace_test_and_set_recursion() and trace_clear_recursion() also requires preemption to be disabled, we can just merge the logic. This patch makes sure that preemption is disabled when trace_test_and_set_recursion() returns bit >= 0, and that trace_clear_recursion() re-enables preemption if it was previously enabled. Link: https://lkml.kernel.org/r/13bde807-779c-aa4c-0672-20515ae365ea@linux.alibaba.com CC: Petr Mladek Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: Nicholas Piggin Cc: Jisheng Zhang CC: Steven Rostedt CC: Miroslav Benes Reported-by: Abaci Suggested-by: Peter Zijlstra Signed-off-by: Michael Wang [ Removed extra line in comment - SDR ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 24f284eb55a7..a13f23b04d73 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -155,6 +155,9 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +/* + * Preemption is promised to be disabled when return bit >= 0. + */ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { @@ -189,14 +192,20 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign current->trace_recursion = val; barrier(); + preempt_disable_notrace(); + return bit + 1; } +/* + * Preemption will be enabled (if it was previously enabled).
+ */ static __always_inline void trace_clear_recursion(int bit) { if (!bit) return; + preempt_enable_notrace(); barrier(); bit--; trace_recursion_clear(bit); @@ -209,7 +218,7 @@ static __always_inline void trace_clear_recursion(int bit) * tracing recursed in the same context (normal vs interrupt), * * Returns: -1 if a recursion happened. - * >= 0 if no recursion + * >= 0 if no recursion. */ static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, unsigned long parent_ip) -- cgit v1.2.3 From e531e90b5ab0f7ce5ff298e165214c1aec6ed187 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 30 Aug 2021 21:37:23 -0700 Subject: tracing: Increase PERF_MAX_TRACE_SIZE to handle Sentinel1 and docker together Running endpoint security solutions like Sentinel1 that use perf-based tracing heavily leads to this repeated dump complaining about dockerd. The default value of 2048 is nowhere near large enough. Using the prior patch "tracing: show size of requested buffer", we get "perf buffer not large enough, wanted 6644, have 6144", after repeated up-sizing (I did 2/4/6/8K). With 8K, the problem doesn't occur at all, so below is the trace for 6K. I'm wondering if this value should be selectable at boot time, but this is a good starting point. ``` ------------[ cut here ]------------ perf buffer not large enough, wanted 6644, have 6144 WARNING: CPU: 1 PID: 4997 at kernel/trace/trace_event_perf.c:402 perf_trace_buf_alloc+0x8c/0xa0 Modules linked in: [..] CPU: 1 PID: 4997 Comm: sh Tainted: G T 5.13.13-x86_64-00039-gb3959163488e #63 Hardware name: LENOVO 20KH002JUS/20KH002JUS, BIOS N23ET66W (1.41 ) 09/02/2019 RIP: 0010:perf_trace_buf_alloc+0x8c/0xa0 Code: 80 3d 43 97 d0 01 00 74 07 31 c0 5b 5d 41 5c c3 ba 00 18 00 00 89 ee 48 c7 c7 00 82 7d 91 c6 05 25 97 d0 01 01 e8 22 ee bc 00 <0f> 0b 31 c0 eb db 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 55 89 RSP: 0018:ffffb922026b7d58 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9da5ee012000 RCX: 0000000000000027 RDX: ffff9da881657828 RSI: 0000000000000001 RDI: ffff9da881657820 RBP: 00000000000019f4 R08: 0000000000000000 R09: ffffb922026b7b80 R10: ffffb922026b7b78 R11: ffffffff91dda688 R12: 000000000000000f R13: ffff9da5ee012108 R14: ffff9da8816570a0 R15: ffffb922026b7e30 FS: 00007f420db1a080(0000) GS:ffff9da881640000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000060 CR3: 00000002504a8006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: kprobe_perf_func+0x11e/0x270 ? do_execveat_common.isra.0+0x1/0x1c0 ? do_execveat_common.isra.0+0x5/0x1c0 kprobe_ftrace_handler+0x10e/0x1d0 0xffffffffc03aa0c8 ? do_execveat_common.isra.0+0x1/0x1c0 do_execveat_common.isra.0+0x5/0x1c0 __x64_sys_execve+0x33/0x40 do_syscall_64+0x6b/0xc0 ?
do_syscall_64+0x11/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f420dc1db37 Code: ff ff 76 e7 f7 d8 64 41 89 00 eb df 0f 1f 80 00 00 00 00 f7 d8 64 41 89 00 eb dc 0f 1f 84 00 00 00 00 00 b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 01 43 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffd4e8b4e38 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f420dc1db37 RDX: 0000564338d1e740 RSI: 0000564338d32d50 RDI: 0000564338d28f00 RBP: 0000564338d28f00 R08: 0000564338d32d50 R09: 0000000000000020 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000564338d28f00 R13: 0000564338d32d50 R14: 0000564338d1e740 R15: 0000564338d28c60 ---[ end trace 83ab3e8e16275e49 ]--- ``` Link: https://lkml.kernel.org/r/20210831043723.13481-2-robbat2@gentoo.org Signed-off-by: Robin H. Johnson Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3e475eeb5a99..50453b287615 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -671,7 +671,7 @@ struct trace_event_file { } \ early_initcall(trace_init_perf_perm_##name); -#define PERF_MAX_TRACE_SIZE 2048 +#define PERF_MAX_TRACE_SIZE 8192 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ -- cgit v1.2.3 From f941eadd8d6d4ee2f8c9aeab8e1da5e647533a7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2021 14:41:31 -0700 Subject: bpf: Avoid races in __bpf_prog_run() for 32bit arches __bpf_prog_run() can run from non-IRQ contexts, meaning it could be re-entered if interrupted. This calls for the IRQ-safe variants of u64_stats_update_{begin|end}, or we risk a deadlock. This patch is a no-op on 64bit arches, fortunately. syzbot report: WARNING: inconsistent lock state 5.12.0-rc3-syzkaller #0 Not tainted -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
udevd/4013 [HC0[0]:SC0[0]:HE1:SE1] takes: ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: sk_filter include/linux/filter.h:867 [inline] ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: do_one_broadcast net/netlink/af_netlink.c:1468 [inline] ff7c9dec (&(&pstats->syncp)->seq){+.?.}-{0:0}, at: netlink_broadcast_filtered+0x27c/0x4fc net/netlink/af_netlink.c:1520 {IN-SOFTIRQ-W} state was registered at: lock_acquire.part.0+0xf0/0x41c kernel/locking/lockdep.c:5510 lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5483 do_write_seqcount_begin_nested include/linux/seqlock.h:520 [inline] do_write_seqcount_begin include/linux/seqlock.h:545 [inline] u64_stats_update_begin include/linux/u64_stats_sync.h:129 [inline] bpf_prog_run_pin_on_cpu include/linux/filter.h:624 [inline] bpf_prog_run_clear_cb+0x1bc/0x270 include/linux/filter.h:755 run_filter+0xa0/0x17c net/packet/af_packet.c:2031 packet_rcv+0xc0/0x3e0 net/packet/af_packet.c:2104 dev_queue_xmit_nit+0x2bc/0x39c net/core/dev.c:2387 xmit_one net/core/dev.c:3588 [inline] dev_hard_start_xmit+0x94/0x518 net/core/dev.c:3609 sch_direct_xmit+0x11c/0x1f0 net/sched/sch_generic.c:313 qdisc_restart net/sched/sch_generic.c:376 [inline] __qdisc_run+0x194/0x7f8 net/sched/sch_generic.c:384 qdisc_run include/net/pkt_sched.h:136 [inline] qdisc_run include/net/pkt_sched.h:128 [inline] __dev_xmit_skb net/core/dev.c:3795 [inline] __dev_queue_xmit+0x65c/0xf84 net/core/dev.c:4150 dev_queue_xmit+0x14/0x18 net/core/dev.c:4215 neigh_resolve_output net/core/neighbour.c:1491 [inline] neigh_resolve_output+0x170/0x228 net/core/neighbour.c:1471 neigh_output include/net/neighbour.h:510 [inline] ip6_finish_output2+0x2e4/0x9fc net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:182 [inline] __ip6_finish_output+0x164/0x3f8 net/ipv6/ip6_output.c:161 ip6_finish_output+0x2c/0xb0 net/ipv6/ip6_output.c:192 NF_HOOK_COND include/linux/netfilter.h:290 [inline] ip6_output+0x74/0x294 net/ipv6/ip6_output.c:215 dst_output include/net/dst.h:448 [inline] NF_HOOK include/linux/netfilter.h:301 [inline] NF_HOOK include/linux/netfilter.h:295 [inline] mld_sendpack+0x2a8/0x7e4 net/ipv6/mcast.c:1679 mld_send_cr net/ipv6/mcast.c:1975 [inline] mld_ifc_timer_expire+0x1e8/0x494 net/ipv6/mcast.c:2474 call_timer_fn+0xd0/0x570 kernel/time/timer.c:1431 expire_timers kernel/time/timer.c:1476 [inline] __run_timers kernel/time/timer.c:1745 [inline] run_timer_softirq+0x2e4/0x384 kernel/time/timer.c:1758 __do_softirq+0x204/0x7ac kernel/softirq.c:345 do_softirq_own_stack include/asm-generic/softirq_stack.h:10 [inline] invoke_softirq kernel/softirq.c:228 [inline] __irq_exit_rcu+0x1d8/0x200 kernel/softirq.c:422 irq_exit+0x10/0x3c kernel/softirq.c:446 __handle_domain_irq+0xb4/0x120 kernel/irq/irqdesc.c:692 handle_domain_irq include/linux/irqdesc.h:176 [inline] gic_handle_irq+0x84/0xac drivers/irqchip/irq-gic.c:370 __irq_svc+0x5c/0x94 arch/arm/kernel/entry-armv.S:205 debug_smp_processor_id+0x0/0x24 lib/smp_processor_id.c:53 rcu_read_lock_held_common kernel/rcu/update.c:108 [inline] rcu_read_lock_sched_held+0x24/0x7c kernel/rcu/update.c:123 trace_lock_acquire+0x24c/0x278 include/trace/events/lock.h:13 lock_acquire+0x3c/0x74 kernel/locking/lockdep.c:5481 rcu_lock_acquire include/linux/rcupdate.h:267 [inline] rcu_read_lock include/linux/rcupdate.h:656 [inline] avc_has_perm_noaudit+0x6c/0x260 security/selinux/avc.c:1150 selinux_inode_permission+0x140/0x220 security/selinux/hooks.c:3141 security_inode_permission+0x44/0x60 security/security.c:1268 inode_permission.part.0+0x5c/0x13c fs/namei.c:521 
inode_permission fs/namei.c:494 [inline] may_lookup fs/namei.c:1652 [inline] link_path_walk.part.0+0xd4/0x38c fs/namei.c:2208 link_path_walk fs/namei.c:2189 [inline] path_lookupat+0x3c/0x1b8 fs/namei.c:2419 filename_lookup+0xa8/0x1a4 fs/namei.c:2453 user_path_at_empty+0x74/0x90 fs/namei.c:2733 do_readlinkat+0x5c/0x12c fs/stat.c:417 __do_sys_readlink fs/stat.c:450 [inline] sys_readlink+0x24/0x28 fs/stat.c:447 ret_fast_syscall+0x0/0x2c arch/arm/mm/proc-v7.S:64 0x7eaa4974 irq event stamp: 298277 hardirqs last enabled at (298277): [<802000d0>] no_work_pending+0x4/0x34 hardirqs last disabled at (298276): [<8020c9b8>] do_work_pending+0x9c/0x648 arch/arm/kernel/signal.c:676 softirqs last enabled at (298216): [<8020167c>] __do_softirq+0x584/0x7ac kernel/softirq.c:372 softirqs last disabled at (298201): [<8024dff4>] do_softirq_own_stack include/asm-generic/softirq_stack.h:10 [inline] softirqs last disabled at (298201): [<8024dff4>] invoke_softirq kernel/softirq.c:228 [inline] softirqs last disabled at (298201): [<8024dff4>] __irq_exit_rcu+0x1d8/0x200 kernel/softirq.c:422 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&pstats->syncp)->seq); lock(&(&pstats->syncp)->seq); *** DEADLOCK *** 1 lock held by udevd/4013: #0: 82b09c5c (rcu_read_lock){....}-{1:2}, at: sk_filter_trim_cap+0x54/0x434 net/core/filter.c:139 stack backtrace: CPU: 1 PID: 4013 Comm: udevd Not tainted 5.12.0-rc3-syzkaller #0 Hardware name: ARM-Versatile Express Backtrace: [<81802550>] (dump_backtrace) from [<818027c4>] (show_stack+0x18/0x1c arch/arm/kernel/traps.c:252) r7:00000080 r6:600d0093 r5:00000000 r4:82b58344 [<818027ac>] (show_stack) from [<81809e98>] (__dump_stack lib/dump_stack.c:79 [inline]) [<818027ac>] (show_stack) from [<81809e98>] (dump_stack+0xb8/0xe8 lib/dump_stack.c:120) [<81809de0>] (dump_stack) from [<81804a00>] (print_usage_bug.part.0+0x228/0x230 kernel/locking/lockdep.c:3806) r7:86bcb768 r6:81a0326c r5:830f96a8 r4:86bcb0c0 [<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (print_usage_bug kernel/locking/lockdep.c:3776 [inline]) [<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (valid_state kernel/locking/lockdep.c:3818 [inline]) [<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (mark_lock_irq kernel/locking/lockdep.c:4021 [inline]) [<818047d8>] (print_usage_bug.part.0) from [<802bb1b8>] (mark_lock.part.0+0xc34/0x136c kernel/locking/lockdep.c:4478) r10:83278fe8 r9:82c6d748 r8:00000000 r7:82c6d2d4 r6:00000004 r5:86bcb768 r4:00000006 [<802ba584>] (mark_lock.part.0) from [<802bc644>] (mark_lock kernel/locking/lockdep.c:4442 [inline]) [<802ba584>] (mark_lock.part.0) from [<802bc644>] (mark_usage kernel/locking/lockdep.c:4391 [inline]) [<802ba584>] (mark_lock.part.0) from [<802bc644>] (__lock_acquire+0x9bc/0x3318 kernel/locking/lockdep.c:4854) r10:86bcb768 r9:86bcb0c0 r8:00000001 r7:00040000 r6:0000075a r5:830f96a8 r4:00000000 [<802bbc88>] (__lock_acquire) from [<802bfb90>] (lock_acquire.part.0+0xf0/0x41c kernel/locking/lockdep.c:5510) r10:00000000 r9:600d0013 r8:00000000 r7:00000000 r6:828a2680 r5:828a2680 r4:861e5bc8 [<802bfaa0>] (lock_acquire.part.0) from [<802bff28>] (lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5483) r10:8146137c r9:00000000 r8:00000001 r7:00000000 r6:00000000 r5:00000000 r4:ff7c9dec [<802bfebc>] (lock_acquire) from [<81381eb4>] (do_write_seqcount_begin_nested include/linux/seqlock.h:520 [inline]) [<802bfebc>] (lock_acquire) from [<81381eb4>] (do_write_seqcount_begin include/linux/seqlock.h:545 [inline]) [<802bfebc>] (lock_acquire) 
from [<81381eb4>] (u64_stats_update_begin include/linux/u64_stats_sync.h:129 [inline]) [<802bfebc>] (lock_acquire) from [<81381eb4>] (__bpf_prog_run_save_cb include/linux/filter.h:727 [inline]) [<802bfebc>] (lock_acquire) from [<81381eb4>] (bpf_prog_run_save_cb include/linux/filter.h:741 [inline]) [<802bfebc>] (lock_acquire) from [<81381eb4>] (sk_filter_trim_cap+0x26c/0x434 net/core/filter.c:149) r10:a4095dd0 r9:ff7c9dd0 r8:e44be000 r7:8146137c r6:00000001 r5:8611ba80 r4:00000000 [<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (sk_filter include/linux/filter.h:867 [inline]) [<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (do_one_broadcast net/netlink/af_netlink.c:1468 [inline]) [<81381c48>] (sk_filter_trim_cap) from [<8146137c>] (netlink_broadcast_filtered+0x27c/0x4fc net/netlink/af_netlink.c:1520) r10:00000001 r9:833d6b1c r8:00000000 r7:8572f864 r6:8611ba80 r5:8698d800 r4:8572f800 [<81461100>] (netlink_broadcast_filtered) from [<81463e60>] (netlink_broadcast net/netlink/af_netlink.c:1544 [inline]) [<81461100>] (netlink_broadcast_filtered) from [<81463e60>] (netlink_sendmsg+0x3d0/0x478 net/netlink/af_netlink.c:1925) r10:00000000 r9:00000002 r8:8698d800 r7:000000b7 r6:8611b900 r5:861e5f50 r4:86aa3000 [<81463a90>] (netlink_sendmsg) from [<81321f54>] (sock_sendmsg_nosec net/socket.c:654 [inline]) [<81463a90>] (netlink_sendmsg) from [<81321f54>] (sock_sendmsg+0x3c/0x4c net/socket.c:674) r10:00000000 r9:861e5dd4 r8:00000000 r7:86570000 r6:00000000 r5:86570000 r4:861e5f50 [<81321f18>] (sock_sendmsg) from [<813234d0>] (____sys_sendmsg+0x230/0x29c net/socket.c:2350) r5:00000040 r4:861e5f50 [<813232a0>] (____sys_sendmsg) from [<8132549c>] (___sys_sendmsg+0xac/0xe4 net/socket.c:2404) r10:00000128 r9:861e4000 r8:00000000 r7:00000000 r6:86570000 r5:861e5f50 r4:00000000 [<813253f0>] (___sys_sendmsg) from [<81325684>] (__sys_sendmsg net/socket.c:2433 [inline]) [<813253f0>] (___sys_sendmsg) from [<81325684>] (__do_sys_sendmsg net/socket.c:2442 [inline]) [<813253f0>] (___sys_sendmsg) from [<81325684>] (sys_sendmsg+0x58/0xa0 net/socket.c:2440) r8:80200224 r7:00000128 r6:00000000 r5:7eaa541c r4:86570000 [<8132562c>] (sys_sendmsg) from [<80200060>] (ret_fast_syscall+0x0/0x2c arch/arm/mm/proc-v7.S:64) Exception stack(0x861e5fa8 to 0x861e5ff0) 5fa0: 00000000 00000000 0000000c 7eaa541c 00000000 00000000 5fc0: 00000000 00000000 76fbf840 00000128 00000000 0000008f 7eaa541c 000563f8 5fe0: 00056110 7eaa53e0 00036cec 76c9bf44 r6:76fbf840 r5:00000000 r4:00000000 Fixes: 492ecee892c2 ("bpf: enable program stats") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211026214133.3114279-2-eric.dumazet@gmail.com --- include/linux/filter.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 47f80adbe744..2fffe9cc50f9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -612,13 +612,14 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 start = sched_clock(); + unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = 
dfunc(ctx, prog->insnsi, prog->bpf_func); } -- cgit v1.2.3 From 61a0abaee2092eee69e44fe60336aa2f5b578938 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2021 14:41:33 -0700 Subject: bpf: Use u64_stats_t in struct bpf_prog_stats Commit 316580b69d0a ("u64_stats: provide u64_stats_t type") fixed possible load/store tearing on 64bit arches. For instance, the following C code:

	stats->nsecs += sched_clock() - start;

could be rightfully implemented like this by a compiler, confusing concurrent readers a lot:

	stats->nsecs += sched_clock();
	// arbitrary delay
	stats->nsecs -= start;

Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211026214133.3114279-4-eric.dumazet@gmail.com --- include/linux/filter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 2fffe9cc50f9..9782e3245852 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -553,9 +553,9 @@ struct bpf_binary_header { }; struct bpf_prog_stats { - u64 cnt; - u64 nsecs; - u64 misses; + u64_stats_t cnt; + u64_stats_t nsecs; + u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -617,8 +617,8 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); - stats->cnt++; - stats->nsecs += sched_clock() - start; + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); -- cgit v1.2.3 From 259714100d98b50bf04d36a21bf50ca8b829fc11 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Mon, 25 Oct 2021 15:01:53 +0800 Subject: PM / wakeirq: support enabling wake-up irq after runtime_suspend called When the dedicated wake IRQ is level triggered and uses the device's low-power status as the wakeup source, the wake IRQ will be triggered if it is enabled while the device is not in the low-power state; for this case, we need to enable the wake IRQ after running the device's ->runtime_suspend(), which makes it enter the low-power state. E.g. assume the wake IRQ is a low-level trigger type, and the wakeup signal comes from the low-power status of the device. The wakeup signal is low at running time (0), becomes high when the device enters the low-power state (runtime_suspend (1) is called), and a wakeup event at (2) makes the device exit the low-power state, after which the wakeup signal becomes low again. [waveform diagram: signal low during (0), high between (1) and (2), low again afterwards; (3) and (4) mark safe points, after ->runtime_suspend(), to enable the wake IRQ] If the wake IRQ is enabled before running runtime_suspend, i.e. during (0), a wake IRQ will fire and cause an immediate resume; it works if the wake IRQ is enabled after running ->runtime_suspend(), e.g. at (3) or (4). This patch introduces a new status WAKE_IRQ_DEDICATED_REVERSE to optionally support enabling the wake IRQ after running ->runtime_suspend(). Suggested-by: Rafael J. Wysocki Signed-off-by: Chunfeng Yun Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_wakeirq.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h index cd5b62db9084..e63a63aa47a3 100644 --- a/include/linux/pm_wakeirq.h +++ b/include/linux/pm_wakeirq.h @@ -17,8 +17,8 @@ #ifdef CONFIG_PM extern int dev_pm_set_wake_irq(struct device *dev, int irq); -extern int dev_pm_set_dedicated_wake_irq(struct device *dev, - int irq); +extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq); +extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq); extern void dev_pm_clear_wake_irq(struct device *dev); extern void dev_pm_enable_wake_irq(struct device *dev); extern void dev_pm_disable_wake_irq(struct device *dev); @@ -35,6 +35,11 @@ static inline int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) return 0; } +static inline int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq) +{ + return 0; +} + static inline void dev_pm_clear_wake_irq(struct device *dev) { } -- cgit v1.2.3 From 570b1cac477643cbf01a45fa5d018430a1fddbce Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Tue, 26 Oct 2021 22:40:12 +0800 Subject: block: Add a helper to validate the block size There is some duplicated code to validate the block size in block drivers. This limitation actually comes from the block layer, so this patch adds a new block layer helper for that. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20211026144015.188-2-xieyongji@bytedance.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6d95a4b36cfa..d2d627e2c782 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -44,6 +44,14 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 +static inline int blk_validate_block_size(unsigned int bsize) +{ + if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + return -EINVAL; + + return 0; +} + static inline bool blk_op_is_passthrough(unsigned int op) { op &= REQ_OP_MASK; -- cgit v1.2.3 From 9dfc685e0262d4c5e44e13302f89841fa75173ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2021 14:30:14 -0700 Subject: inet: remove races in inet{6}_getname() syzbot reported data-races in inet_getname() multiple times; it is time we fix this instead of pretending applications should not trigger them. getsockname() and getpeername() are not really considered fast path.
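One plausible shape for the fix (an assumption about the approach, since the hunk below only adds a stub macro for CONFIG_CGROUP_BPF=n) is to simply serialize getname() with the socket lock:

```c
/* Hypothetical sketch, assuming sk/inet/sin from inet_getname()'s
 * context: take the socket lock so the address/port snapshot cannot
 * race with __inet_hash_connect() rewriting inet_sport.
 */
lock_sock(sk);
sin->sin_port = inet->inet_sport;        /* stable under the lock */
sin->sin_addr.s_addr = inet->inet_saddr;
release_sock(sk);
```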
v2: added the missing BPF_CGROUP_RUN_SA_PROG() declaration needed when CONFIG_CGROUP_BPF=n, as reported by kernel test robot syzbot typical report: BUG: KCSAN: data-race in __inet_hash_connect / inet_getname write to 0xffff888136d66cf8 of 2 bytes by task 14374 on cpu 1: __inet_hash_connect+0x7ec/0x950 net/ipv4/inet_hashtables.c:831 inet_hash_connect+0x85/0x90 net/ipv4/inet_hashtables.c:853 tcp_v4_connect+0x782/0xbb0 net/ipv4/tcp_ipv4.c:275 __inet_stream_connect+0x156/0x6e0 net/ipv4/af_inet.c:664 inet_stream_connect+0x44/0x70 net/ipv4/af_inet.c:728 __sys_connect_file net/socket.c:1896 [inline] __sys_connect+0x254/0x290 net/socket.c:1913 __do_sys_connect net/socket.c:1923 [inline] __se_sys_connect net/socket.c:1920 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1920 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xa0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888136d66cf8 of 2 bytes by task 14408 on cpu 0: inet_getname+0x11f/0x170 net/ipv4/af_inet.c:790 __sys_getsockname+0x11d/0x1b0 net/socket.c:1946 __do_sys_getsockname net/socket.c:1961 [inline] __se_sys_getsockname net/socket.c:1958 [inline] __x64_sys_getsockname+0x3e/0x50 net/socket.c:1958 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xa0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000 -> 0xdee0 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 14408 Comm: syz-executor.3 Not tainted 5.15.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211026213014.3026708-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2746fd804216..3536ab432b30 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -517,6 +517,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define cgroup_bpf_enabled(atype) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) -- cgit v1.2.3 From d4dccf353db80e209f262e3973c834e6e48ba9a9 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 25 Oct 2021 08:21:09 -0400 Subject: Drivers: hv: vmbus: Mark vmbus ring buffer visible to host in Isolation VM Mark vmbus ring buffer visible with set_memory_decrypted() when establish gpadl handle. 
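For orientation, a minimal sketch of what marking the ring buffer visible means here, assuming the generic set_memory_decrypted() API and illustrative names; the hunk below only carries the header-side GPADL changes:

```c
/* Hedged sketch (not the patch itself): share the ring-buffer pages
 * with the host before the GPADL (guest physical address descriptor
 * list) is established in an Isolation VM.
 */
static int sketch_make_ring_visible(void *ring_buffer, u32 ring_size)
{
	int ret;

	ret = set_memory_decrypted((unsigned long)ring_buffer,
				   ring_size >> PAGE_SHIFT);
	if (ret)
		pr_err("failed to mark ring buffer host-visible: %d\n", ret);
	return ret;
}
```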
Reviewed-by: Michael Kelley Signed-off-by: Tianyu Lan Link: https://lore.kernel.org/r/20211025122116.264793-5-ltykernel@gmail.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index ddc8713ce57b..a9e0bc3b1511 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -803,6 +803,12 @@ struct vmbus_device { #define VMBUS_DEFAULT_MAX_PKT_SIZE 4096 +struct vmbus_gpadl { + u32 gpadl_handle; + u32 size; + void *buffer; +}; + struct vmbus_channel { struct list_head listentry; @@ -822,7 +828,7 @@ struct vmbus_channel { bool rescind_ref; /* got rescind msg, got channel reference */ struct completion rescind_event; - u32 ringbuffer_gpadlhandle; + struct vmbus_gpadl ringbuffer_gpadlhandle; /* Allocated memory for ring buffer */ struct page *ringbuffer_page; @@ -1192,10 +1198,10 @@ extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, extern int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, u32 size, - u32 *gpadl_handle); + struct vmbus_gpadl *gpadl); extern int vmbus_teardown_gpadl(struct vmbus_channel *channel, - u32 gpadl_handle); + struct vmbus_gpadl *gpadl); void vmbus_reset_channel_cb(struct vmbus_channel *channel); -- cgit v1.2.3 From 20cf6616ccd50256a14fb2a7a3cc730080c90cd0 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 25 Oct 2021 12:54:34 -0700 Subject: Drivers: hv: vmbus: Remove unused code to check for subchannels The last caller of vmbus_are_subchannels_present() was removed in commit c967590457ca ("scsi: storvsc: Fix a race in sub-channel creation that can cause panic"). Remove this dead code, and the utility function invoke_sc_cb() that it is the only caller of. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1635191674-34407-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a9e0bc3b1511..b823311eac79 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1106,19 +1106,6 @@ void vmbus_set_sc_create_callback(struct vmbus_channel *primary_channel, void vmbus_set_chn_rescind_callback(struct vmbus_channel *channel, void (*chn_rescind_cb)(struct vmbus_channel *)); -/* - * Check if sub-channels have already been offerred. This API will be useful - * when the driver is unloaded after establishing sub-channels. In this case, - * when the driver is re-loaded, the driver would have to check if the - * subchannels have already been established before attempting to request - * the creation of sub-channels. - * This function returns TRUE to indicate that subchannels have already been - * created. - * This function should be invoked after setting the callback function for - * sub-channel creation. - */ -bool vmbus_are_subchannels_present(struct vmbus_channel *primary); - /* The format must be the same as struct vmdata_gpa_direct */ struct vmbus_channel_packet_page_buffer { u16 type; -- cgit v1.2.3 From 9330986c03006ab1d33d243b7cfe598a7a3c1baa Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 27 Oct 2021 16:45:00 -0700 Subject: bpf: Add bloom filter map implementation This patch adds the kernel-side changes for the implementation of a bpf bloom filter map. 
The bloom filter map supports peek (determining whether an element is present in the map) and push (adding an element to the map) operations. These operations are exposed to userspace applications through the already existing syscalls in the following way: BPF_MAP_LOOKUP_ELEM -> peek BPF_MAP_UPDATE_ELEM -> push The bloom filter map does not have keys, only values. In light of this, the bloom filter map's API matches that of queue/stack maps: user applications use BPF_MAP_LOOKUP_ELEM/BPF_MAP_UPDATE_ELEM, which correspond internally to bpf_map_peek_elem/bpf_map_push_elem, and bpf programs must use the bpf_map_peek_elem and bpf_map_push_elem APIs to query or add an element to the bloom filter map. When the bloom filter map is created, it must be created with a key_size of 0. For updates, the user will pass in the element to add to the map as the value, with a NULL key. For lookups, the user will pass in the element to query in the map as the value, with a NULL key. In the verifier layer, this requires us to modify the argument type of a bloom filter's BPF_FUNC_map_peek_elem call to ARG_PTR_TO_MAP_VALUE; as well, in the syscall layer, we need to copy over the user value so that in bpf_map_peek_elem, we know which specific value to query. A few things to take note of: * If there are any concurrent lookups + updates, the user is responsible for synchronizing this to ensure no false negative lookups occur. * The number of hashes to use for the bloom filter is configurable from userspace. If no number is specified, the default used will be 5 hash functions. The benchmarks later in this patchset can help compare the performance of using different numbers of hashes on different entry sizes. In general, using more hashes decreases both the false positive rate and the speed of a lookup. * Deleting an element in the bloom filter map is not supported. * The bloom filter map may be used as an inner map. * The "max_entries" size that is specified at map creation time is used to approximate a reasonable bitmap size for the bloom filter, and is not otherwise strictly enforced. If the user wishes to insert more entries into the bloom filter than "max_entries", they may do so, but they should be aware that this may lead to a higher false positive rate.
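A hedged userspace sketch of the API described above, using the raw bpf(2) syscall; BPF_MAP_TYPE_BLOOM_FILTER and the map_extra attribute follow this series' uapi, and the surrounding flow is illustrative:

```c
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Sketch: create a bloom filter map, push one value, then peek it.
 * key_size must be 0; map_extra selects the number of hash functions
 * (0 means the default of 5).
 */
int bloom_demo(void)
{
	union bpf_attr attr;
	__u64 value = 42;
	int map_fd, err;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_BLOOM_FILTER;
	attr.key_size = 0;		/* bloom filter maps have no keys */
	attr.value_size = sizeof(value);
	attr.max_entries = 10000;
	attr.map_extra = 3;		/* use 3 hash functions */

	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	/* push: BPF_MAP_UPDATE_ELEM with a NULL key */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = 0;			/* no key */
	attr.value = (__u64)(unsigned long)&value;
	err = syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
	if (err)
		return err;

	/* peek: 0 means "possibly present"; -1 with errno ENOENT means
	 * definitely absent.
	 */
	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
```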
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211027234504.30744-2-joannekoong@fb.com --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31421c74ba08..50105e0b8fcc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -169,6 +169,7 @@ struct bpf_map { u32 value_size; u32 max_entries; u32 map_flags; + u64 map_extra; /* any per-map-type extra fields */ int spin_lock_off; /* >=0 valid offset, <0 error */ int timer_off; /* >=0 valid offset, <0 error */ u32 id; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9c81724e4b98..c4424ac2fa02 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -125,6 +125,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) -- cgit v1.2.3 From d6aef08a872b9e23eecc92d0e92393473b13c497 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 28 Oct 2021 12:04:54 +0530 Subject: bpf: Add bpf_kallsyms_lookup_name helper This helper allows us to get the address of a kernel symbol from inside a BPF_PROG_TYPE_SYSCALL prog (used by gen_loader), so that we can relocate typeless ksym vars. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211028063501.2239335-2-memxor@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50105e0b8fcc..6deebf8bf78f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2110,6 +2110,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From eac96c3efdb593df1a57bb5b95dbe037bfa9a522 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 28 Oct 2021 14:36:11 -0700 Subject: mm: filemap: check if THP has hwpoisoned subpage for PMD page fault When handling shmem page fault the THP with corrupted subpage could be PMD mapped if certain conditions are satisfied. But kernel is supposed to send SIGBUS when trying to map hwpoisoned page. There are two paths which may do PMD map: fault around and regular fault. Before commit f9ce0be71d1f ("mm: Cleanup faultaround and finish_fault() codepaths") the thing was even worse in fault around path. The THP could be PMD mapped as long as the VMA fits regardless what subpage is accessed and corrupted. After this commit as long as head page is not corrupted the THP could be PMD mapped. In the regular fault path the THP could be PMD mapped as long as the corrupted page is not accessed and the VMA fits. This loophole could be fixed by iterating every subpage to check if any of them is hwpoisoned or not, but it is somewhat costly in page fault path. 
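Concretely, the per-subpage check being avoided would look roughly like this (a hypothetical sketch, with head assumed to be the compound head page; not code from the patch):

```c
/* Hypothetical per-subpage scan in the PMD fault path: O(HPAGE_PMD_NR)
 * flag tests on every huge-page fault, which the new flag avoids.
 */
int i;

for (i = 0; i < HPAGE_PMD_NR; i++)
	if (PageHWPoison(head + i))
		return VM_FAULT_HWPOISON;
```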
So introduce a new page flag called HasHWPoisoned on the first tail page. It indicates the THP has hwpoisoned subpage(s). It is set if any subpage of THP is found hwpoisoned by memory failure and after the refcount is bumped successfully, then cleared when the THP is freed or split. The soft offline path doesn't need this since soft offline handler just marks a subpage hwpoisoned when the subpage is migrated successfully. But shmem THP didn't get split then migrated at all. Link: https://lkml.kernel.org/r/20211020210755.23964-3-shy828301@gmail.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Signed-off-by: Yang Shi Reviewed-by: Naoya Horiguchi Suggested-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Peter Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a558d67ee86f..fbfd3fad48f2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,6 +171,15 @@ enum pageflags { /* Compound pages. Stored in first tail page's flags */ PG_double_map = PG_workingset, +#ifdef CONFIG_MEMORY_FAILURE + /* + * Compound pages. Stored in first tail page's flags. + * Indicates that at least one subpage is hwpoisoned in the + * THP. + */ + PG_has_hwpoisoned = PG_mappedtodisk, +#endif + /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -668,6 +677,20 @@ PAGEFLAG_FALSE(DoubleMap) TESTSCFLAG_FALSE(DoubleMap) #endif +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +/* + * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the + * compound page. + * + * This flag is set by hwpoison handler. Cleared by THP split or free page. + */ +PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) + TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +#else +PAGEFLAG_FALSE(HasHWPoisoned) + TESTSCFLAG_FALSE(HasHWPoisoned) +#endif + /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with -- cgit v1.2.3 From 78476d315e190533757ab894255c4f2c2f254bce Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 29 Oct 2021 11:01:44 +0800 Subject: mctp: Add flow extension to skb This change adds a new skb extension for MCTP, to represent a request/response flow. The intention is to use this in a later change to allow i2c controllers to correctly configure a multiplexer over a flow. Since we have a cleanup function in the core path (if an extension is present), we'll need to make CONFIG_MCTP a bool, rather than a tristate. Includes a fix for a build warning with clang: Reported-by: kernel test robot Signed-off-by: Jeremy Kerr Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cb96f1e6460c..0bd6520329f6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4243,6 +4243,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; -- cgit v1.2.3 From 0bf6d96cb8294094ce1e44cbe8cf65b0899d0a3a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Oct 2021 09:05:07 +0200 Subject: block: remove blk_{get,put}_request These are now pointless wrappers around blk_mq_{alloc,free}_request, so remove them. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20211025070517.1548584-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ebc45cf0450b..8682663e7368 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -892,9 +892,6 @@ static inline bool rq_is_sync(struct request *rq) } void blk_rq_init(struct request_queue *q, struct request *rq); -void blk_put_request(struct request *rq); -struct request *blk_get_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags); int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); -- cgit v1.2.3 From ee12203746e5cec678c7d126c9901548bfec1dd5 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sat, 9 Oct 2021 09:44:39 -0700 Subject: PCI: Add pci_find_dvsec_capability to find designated VSEC Add pci_find_dvsec_capability to locate a Designated Vendor-Specific Extended Capability with the specified Vendor ID and Capability ID. The Designated Vendor-Specific Extended Capability (DVSEC) allows one or more "vendor" specific capabilities that are not tied to the Vendor ID of the PCI component. Where the DVSEC Vendor may be a standards body like CXL. Cc: David E. Box Cc: Jonathan Cameron Cc: Bjorn Helgaas Cc: Dan Williams Cc: linux-pci@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: Andrew Donnellan Cc: Lu Baolu Reviewed-by: Frederic Barrat Signed-off-by: Ben Widawsky Reviewed-by: Andrew Donnellan Acked-by: Bjorn Helgaas Tested-by: Kan Liang Signed-off-by: Kan Liang Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/163379787943.692348.6814373487017444007.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index cd8aa6fce204..c93ccfa4571b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1130,6 +1130,7 @@ u16 pci_find_ext_capability(struct pci_dev *dev, int cap); u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap); +u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec); u64 pci_get_dsn(struct pci_dev *dev); -- cgit v1.2.3 From 26d5badbccddcc063dc5174a2baffd13a23322aa Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Wed, 20 Oct 2021 12:43:59 -0500 Subject: signal: Implement force_fatal_sig Add a simple helper force_fatal_sig that causes a signal to be delivered to a process as if the signal handler was set to SIG_DFL. Reimplement force_sigsegv based upon this new helper. This fixes force_sigsegv so that when it forces the default signal handler to be used the code now forces the signal to be unblocked as well. Reusing the tested logic in force_sig_info_to_task that was built for force_sig_seccomp this makes the implementation trivial. This is interesting both because it makes force_sigsegv simpler and because there are a couple of buggy places in the kernel that call do_exit(SIGILL) or do_exit(SIGSYS) because there is no straight forward way today for those places to simply force the exit of a process with the chosen signal. Creating force_fatal_sig allows those places to be implemented with normal signal exits. Link: https://lkml.kernel.org/r/20211020174406.17889-13-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index e5f4ce622ee6..e2dc9f119ada 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -338,6 +338,7 @@ extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); +extern void force_fatal_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); -- cgit v1.2.3 From 5bf84b29938579a9350a4a9c2c4f8b5da2aa6992 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 7 May 2021 15:09:10 -0700 Subject: virtchnl: Remove unused VIRTCHNL_VF_OFFLOAD_RSVD define Remove unused define that is currently marked as reserved. This will open up space for a new feature if/when it's introduced. Also, there is no reason to keep unused defines around. Suggested-by: Tony Nguyen Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index db0e099c2399..2e1e1379b569 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -240,7 +240,6 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); */ #define VIRTCHNL_VF_OFFLOAD_L2 0x00000001 #define VIRTCHNL_VF_OFFLOAD_IWARP 0x00000002 -#define VIRTCHNL_VF_OFFLOAD_RSVD 0x00000004 #define VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 #define VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR 0x00000020 -- cgit v1.2.3 From 4a15022f82ee0f2e14a7f953b7bbc0f5c983b663 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 7 May 2021 15:09:11 -0700 Subject: virtchnl: Use the BIT() macro for capability/offload flags Currently raw hex values are used to define specific bits for each capability/offload in virtchnl.h. Using raw hex values makes it unclear which bits are used/available. Fix this by using the BIT() macro so it's immediately obvious which bits are used/available. Also, move the VIRTCHNL_VF_CAP_ADV_LINK_SPEED define in the correct place to line up with the other bit values and add a comment for its purpose. 
Signed-off-by: Brett Creeley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2e1e1379b569..b30a1bc74fc7 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -238,26 +238,26 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); * VIRTCHNL_VF_OFFLOAD_L2 flag is inclusive of base mode L2 offloads including * TX/RX Checksum offloading and TSO for non-tunnelled packets. */ -#define VIRTCHNL_VF_OFFLOAD_L2 0x00000001 -#define VIRTCHNL_VF_OFFLOAD_IWARP 0x00000002 -#define VIRTCHNL_VF_OFFLOAD_RSS_AQ 0x00000008 -#define VIRTCHNL_VF_OFFLOAD_RSS_REG 0x00000010 -#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR 0x00000020 -#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES 0x00000040 -#define VIRTCHNL_VF_OFFLOAD_VLAN 0x00010000 -#define VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x00020000 -#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 0x00040000 -#define VIRTCHNL_VF_OFFLOAD_RSS_PF 0X00080000 -#define VIRTCHNL_VF_OFFLOAD_ENCAP 0X00100000 -#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM 0X00200000 -#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM 0X00400000 -#define VIRTCHNL_VF_OFFLOAD_ADQ 0X00800000 -#define VIRTCHNL_VF_OFFLOAD_USO 0X02000000 -#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF 0X08000000 -#define VIRTCHNL_VF_OFFLOAD_FDIR_PF 0X10000000 - -/* Define below the capability flags that are not offloads */ -#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED 0x00000080 +#define VIRTCHNL_VF_OFFLOAD_L2 BIT(0) +#define VIRTCHNL_VF_OFFLOAD_IWARP BIT(1) +#define VIRTCHNL_VF_OFFLOAD_RSS_AQ BIT(3) +#define VIRTCHNL_VF_OFFLOAD_RSS_REG BIT(4) +#define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR BIT(5) +#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES BIT(6) +/* used to negotiate communicating link speeds in Mbps */ +#define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) +#define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) +#define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) +#define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 BIT(18) +#define VIRTCHNL_VF_OFFLOAD_RSS_PF BIT(19) +#define VIRTCHNL_VF_OFFLOAD_ENCAP BIT(20) +#define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM BIT(21) +#define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) +#define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) +#define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) +#define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) + #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ VIRTCHNL_VF_OFFLOAD_VLAN | \ VIRTCHNL_VF_OFFLOAD_RSS_PF) -- cgit v1.2.3 From 504e15724893a839213fad5eedfbd511d9ba75cc Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 11 Jul 2021 16:56:54 +0300 Subject: net/mlx5: Allow skipping counter refresh on creation CT creates a counter for each CT rule, and for each such counter, fs_counters tries to queue mlx5_fc_stats_work() work again via mod_delayed_work(0) call to refresh all counters. This call has a large performance impact when reaching high insertion rate and accounts for ~8% of the insertion time when using software steering. Allow skipping the refresh of all counters during counter creation. Change CT to use this refresh skipping for it's counters. 
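A hedged sketch of the connection-tracking style call site this enables; the prototype appears in the hunk below, while dev and the error handling are illustrative:

```c
/* Sketch: create a per-rule counter without queueing the global
 * stats-refresh work, trading stats freshness for insertion rate.
 */
struct mlx5_fc *counter;

counter = mlx5_fc_create_ex(dev, true);	/* true = aging enabled */
if (IS_ERR(counter))
	return PTR_ERR(counter);
```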
Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index a7e1155bc4da..cd2d4c572367 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -245,6 +245,10 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_flow_destination *old_dest); struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); + +/* As mlx5_fc_create() but doesn't queue stats refresh thread. */ +struct mlx5_fc *mlx5_fc_create_ex(struct mlx5_core_dev *dev, bool aging); + void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, -- cgit v1.2.3 From f89f9c56e7372b2dda144f83dce61311b298c559 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Mon, 25 Oct 2021 08:22:04 +0200 Subject: mailbox: apple: Add driver for Apple mailboxes Apple SoCs such as the M1 come with various co-processors. Mailboxes are used to communicate with those. This driver adds support for two variants of those mailboxes. Signed-off-by: Sven Peter Signed-off-by: Jassi Brar --- include/linux/apple-mailbox.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/apple-mailbox.h (limited to 'include/linux') diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h new file mode 100644 index 000000000000..720fbb70294a --- /dev/null +++ b/include/linux/apple-mailbox.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* + * Apple mailbox message format + * + * Copyright (C) 2021 The Asahi Linux Contributors + */ + +#ifndef _LINUX_APPLE_MAILBOX_H_ +#define _LINUX_APPLE_MAILBOX_H_ + +#include + +/* encodes a single 96bit message sent over the single channel */ +struct apple_mbox_msg { + u64 msg0; + u32 msg1; +}; + +#endif -- cgit v1.2.3 From 97961f78e8bc7f50ff7113fec030af6fa5f004d0 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 22 Oct 2021 18:18:56 +0800 Subject: mailbox: imx: support i.MX8ULP S4 MU Like i.MX8 SCU, i.MX8ULP S4 also has vendor specific protocol. - bind SCU/S4 MU part to share one tx/rx/init API to make code simple. - S4 msg max size is very large, so alloc the space at driver probe, not use local on stack variable. - S4 MU has 8 TR and 4 RR which is different with i.MX8 MU, so adapt code to reflect this. Tested on i.MX8MP, i.MX8ULP Signed-off-by: Peng Fan Signed-off-by: Jassi Brar --- include/linux/firmware/imx/s4.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/linux/firmware/imx/s4.h (limited to 'include/linux') diff --git a/include/linux/firmware/imx/s4.h b/include/linux/firmware/imx/s4.h new file mode 100644 index 000000000000..9e34923ae1d6 --- /dev/null +++ b/include/linux/firmware/imx/s4.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright 2021 NXP + * + * Header file for the IPC implementation. 
+ */ + +#ifndef _S4_IPC_H +#define _S4_IPC_H + +struct imx_s4_ipc; + +struct imx_s4_rpc_msg { + uint8_t ver; + uint8_t size; + uint8_t cmd; + uint8_t tag; +} __packed; + +#endif /* _S4_IPC_H */ -- cgit v1.2.3 From cc8d7b4aea79df7cb45b74f9bc5b8a8bd2ed4c07 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Wed, 27 Oct 2021 13:21:24 +0300 Subject: tty: Fix extra "not" in TTY_DRIVER_REAL_RAW description TTY_DRIVER_REAL_RAW flag (which is always set for e.g. serial ports) documentation says that driver must always set special character handling flags in certain conditions. However, as the following sentence makes clear, what is actually intended is the opposite. Fix that by removing the unintended double negation. Acked-by: Johan Hovold Signed-off-by: Anssi Hannula Link: https://lore.kernel.org/r/20211027102124.3049414-1-anssi.hannula@bitwise.fi Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 29e1cf178afb..795b94ccdeb6 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -360,7 +360,7 @@ static inline void tty_set_operations(struct tty_driver *driver, * Used for PTY's, in particular. * * TTY_DRIVER_REAL_RAW --- if set, indicates that the driver will - * guarantee never not to set any special character handling + * guarantee never to set any special character handling * flags if ((IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || * !INPCK)). That is, if there is no reason for the driver to * send notifications of parity and break characters up to the -- cgit v1.2.3 From f98a3dccfcb0b9b9c3bef8df9edd61cda80ad937 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Oct 2021 13:59:38 +0200 Subject: locking: Remove spin_lock_flags() etc parisc, ia64 and powerpc32 are the only remaining architectures that provide custom arch_{spin,read,write}_lock_flags() functions, which are meant to re-enable interrupts while waiting for a spinlock. However, none of these can actually run into this codepath, because it is only called on architectures without CONFIG_GENERIC_LOCKBREAK, or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none of those combinations are possible on the three architectures. Going back in the git history, it appears that arch/mn10300 may have been able to run into this code path, but there is a good chance that it never worked. On the architectures that still exist, it was already impossible to hit back in 2008 after the introduction of CONFIG_GENERIC_LOCKBREAK, and possibly earlier. As this is all dead code, just remove it and the helper functions built around it. For arch/ia64, the inline asm could be cleaned up, but it seems safer to leave it untouched. 
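For context, the removed hooks were meant to permit a pattern like the following reconstruction (a sketch assuming a generic architecture, not any architecture's actual code):

```c
/* Spin with interrupts re-enabled, so long IRQs-off windows are
 * avoided while waiting; disable them again before acquiring.
 */
static inline void arch_spin_lock_flags(arch_spinlock_t *lock,
					unsigned long flags)
{
	while (!arch_spin_trylock(lock)) {
		local_irq_restore(flags);	/* allow interrupts while spinning */
		while (arch_spin_is_locked(lock))
			cpu_relax();
		local_irq_disable();		/* re-disable before retrying */
	}
}
```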
Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20211022120058.1031690-1-arnd@kernel.org --- include/linux/lockdep.h | 17 ----------------- include/linux/rwlock.h | 15 --------------- include/linux/rwlock_api_smp.h | 6 ++---- include/linux/spinlock.h | 13 ------------- include/linux/spinlock_api_smp.h | 9 --------- include/linux/spinlock_up.h | 1 - 6 files changed, 2 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 9fe165beb0f9..467b94257105 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -481,23 +481,6 @@ do { \ #endif /* CONFIG_LOCK_STAT */ -#ifdef CONFIG_LOCKDEP - -/* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_*_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ - LOCK_CONTENDED((_lock), (try), (lock)) - -#else /* CONFIG_LOCKDEP */ - -#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ - lockfl((_lock), (flags)) - -#endif /* CONFIG_LOCKDEP */ - #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 7ce9a51ae5c0..2c0ad417ce3c 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -30,31 +30,16 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_read_lock(rwlock_t *lock) __acquires(lock); -#define do_raw_read_lock_flags(lock, flags) do_raw_read_lock(lock) extern int do_raw_read_trylock(rwlock_t *lock); extern void do_raw_read_unlock(rwlock_t *lock) __releases(lock); extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock); -#define do_raw_write_lock_flags(lock, flags) do_raw_write_lock(lock) extern int do_raw_write_trylock(rwlock_t *lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else - -#ifndef arch_read_lock_flags -# define arch_read_lock_flags(lock, flags) arch_read_lock(lock) -#endif - -#ifndef arch_write_lock_flags -# define arch_write_lock_flags(lock, flags) arch_write_lock(lock) -#endif - # define do_raw_read_lock(rwlock) do {__acquire(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_read_lock_flags(lock, flags) \ - do {__acquire(lock); arch_read_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) # define do_raw_read_trylock(rwlock) arch_read_trylock(&(rwlock)->raw_lock) # define do_raw_read_unlock(rwlock) do {arch_read_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) # define do_raw_write_lock(rwlock) do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_write_lock_flags(lock, flags) \ - do {__acquire(lock); arch_write_lock_flags(&(lock)->raw_lock, *(flags)); } while (0) # define do_raw_write_trylock(rwlock) arch_write_trylock(&(rwlock)->raw_lock) # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index abfb53ab11be..f1db6f17c4fb 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -157,8 +157,7 @@ static inline unsigned long __raw_read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_read_trylock, do_raw_read_lock, - do_raw_read_lock_flags, 
&flags); + LOCK_CONTENDED(lock, do_raw_read_trylock, do_raw_read_lock); return flags; } @@ -184,8 +183,7 @@ static inline unsigned long __raw_write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_write_trylock, do_raw_write_lock, - do_raw_write_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); return flags; } diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 45310ea1b1d7..f0447062eecd 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -177,7 +177,6 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); -#define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else @@ -188,18 +187,6 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) mmiowb_spin_lock(); } -#ifndef arch_spin_lock_flags -#define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -#endif - -static inline void -do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lock) -{ - __acquire(lock); - arch_spin_lock_flags(&lock->raw_lock, *flags); - mmiowb_spin_lock(); -} - static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 6b8e1a0b137b..51fa0dab68c4 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -108,16 +108,7 @@ static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock) local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * do_raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -#else - do_raw_spin_lock_flags(lock, &flags); -#endif return flags; } diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 0ac9112c1bbe..16521074b6f7 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -62,7 +62,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) -# define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) # define arch_spin_unlock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_trylock(lock) ({ barrier(); (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ -- cgit v1.2.3 From 9cc2fa4f4a92ccc6760d764e7341be46ee8aaaa1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 5 Oct 2021 00:05:43 +0200 Subject: task_stack: Fix end_of_stack() for architectures with upwards-growing stack The function end_of_stack() returns a pointer to the last entry of a stack. For architectures like parisc where the stack grows upwards return the pointer to the highest address in the stack. Without this change I faced a crash on parisc, because the stackleak functionality wrote STACKLEAK_POISON to the lowest address and thus overwrote the first 4 bytes of the task_struct which included the TIF_FLAGS. 
Signed-off-by: Helge Deller --- include/linux/sched/task_stack.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 2413427e439c..d10150587d81 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -25,7 +25,11 @@ static inline void *task_stack_page(const struct task_struct *task) static inline unsigned long *end_of_stack(const struct task_struct *task) { +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; +#else return task->stack; +#endif } #elif !defined(__HAVE_THREAD_FUNCTIONS) -- cgit v1.2.3 From e60b56e46b384cee1ad34e6adc164d883049c6c3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:35 +0200 Subject: sched/fair: Wait before decaying max_newidle_lb_cost Decay max_newidle_lb_cost only when it has not been updated for a while and ensure to not decay a recently changed value. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-4-vincent.guittot@linaro.org --- include/linux/sched/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 2f9166f6dec8..c07bfa2d80f2 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -105,7 +105,7 @@ struct sched_domain { /* idle_balance() stats */ u64 max_newidle_lb_cost; - unsigned long next_decay_max_lb_cost; + unsigned long last_decay_max_lb_cost; u64 avg_scan_cost; /* select_idle_sibling */ -- cgit v1.2.3 From 7ff22787ba49c2e66dcec92f3e2b79ef6b6a0d71 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 4 Oct 2021 10:07:09 -0700 Subject: platform/chrome: cros_ec_proto: Use EC struct for features The Chrome EC's features are returned through an ec_response_get_features struct, but they are stored in an independent array. Although the two are effectively the same at present (2 unsigned 32 bit ints), there is the possibility that they could go out of sync. Avoid this by only using the EC struct to store the features. Signed-off-by: Prashant Malani Link: https://lore.kernel.org/r/20211004170716.86601-1-pmalani@chromium.org Signed-off-by: Benson Leung --- include/linux/platform_data/cros_ec_proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 9d370816a419..df3c78c92ca2 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -205,7 +205,7 @@ struct cros_ec_dev { struct cros_ec_debugfs *debug_info; bool has_kb_wake_angle; u16 cmd_offset; - u32 features[2]; + struct ec_response_get_features features; }; #define to_cros_ec_dev(dev) container_of(dev, struct cros_ec_dev, class_dev) -- cgit v1.2.3 From d89c8169bd7052c78731137da4c4c06986409c62 Mon Sep 17 00:00:00 2001 From: Wu Zongyong Date: Fri, 29 Oct 2021 17:14:42 +0800 Subject: virtio-pci: introduce legacy device module Split common codes from virtio-pci-legacy so vDPA driver can reuse it later. Signed-off-by: Wu Zongyong Acked-by: Jason Wang Link: https://lore.kernel.org/r/71605acde5e97fcb2760a6973e406279fb1bbd33.1635493219.git.wuzongyong@linux.alibaba.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio_pci_legacy.h | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/linux/virtio_pci_legacy.h (limited to 'include/linux') diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h new file mode 100644 index 000000000000..e5d665faf00e --- /dev/null +++ b/include/linux/virtio_pci_legacy.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VIRTIO_PCI_LEGACY_H +#define _LINUX_VIRTIO_PCI_LEGACY_H + +#include "linux/mod_devicetable.h" +#include +#include + +struct virtio_pci_legacy_device { + struct pci_dev *pci_dev; + + /* Where to read and clear interrupt */ + u8 __iomem *isr; + /* The IO mapping for the PCI config space (legacy mode only) */ + void __iomem *ioaddr; + + struct virtio_device_id id; +}; + +u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev); +u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev); +void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev, + u32 features); +u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev); +void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev, + u8 status); +u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev, + u16 idx, u16 vector); +u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev, + u16 vector); +void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev, + u16 index, u32 queue_pfn); +bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev, + u16 idx); +void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev, + u16 idx, u16 size); +u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev, + u16 idx); +int vp_legacy_probe(struct virtio_pci_legacy_device *ldev); +void vp_legacy_remove(struct virtio_pci_legacy_device *ldev); + +#endif -- cgit v1.2.3 From d0ae1fbfcff48e889bf993ba16890e30f6615593 Mon Sep 17 00:00:00 2001 From: Wu Zongyong Date: Fri, 29 Oct 2021 17:14:43 +0800 Subject: vdpa: fix typo Signed-off-by: Wu Zongyong Acked-by: Jason Wang Link: https://lore.kernel.org/r/4b5153262e4ba64986bb567d7425ad4829ca7bcc.1635493219.git.wuzongyong@linux.alibaba.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 3972ab765de1..a896ee021e5f 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -257,7 +257,7 @@ struct vdpa_config_ops { struct vdpa_notification_area (*get_vq_notification)(struct vdpa_device *vdev, u16 idx); /* vq irq is not expected to be changed once DRIVER_OK is set */ - int (*get_vq_irq)(struct vdpa_device *vdv, u16 idx); + int (*get_vq_irq)(struct vdpa_device *vdev, u16 idx); /* Device ops */ u32 (*get_vq_align)(struct vdpa_device *vdev); -- cgit v1.2.3 From 3b970a5842c9114c82e60744c84a7d06ee51b6f9 Mon Sep 17 00:00:00 2001 From: Wu Zongyong Date: Fri, 29 Oct 2021 17:14:45 +0800 Subject: vdpa: add new callback get_vq_num_min in vdpa_config_ops This callback is optional. For vdpa devices that not support to change virtqueue size, get_vq_num_min and get_vq_num_max will return the same value, so that users can choose a correct value for that device. Suggested-by: Jason Wang Signed-off-by: Wu Zongyong Acked-by: Jason Wang Link: https://lore.kernel.org/r/f4af5b0abd660d9a29ab6b2f67bd6df10284a230.1635493219.git.wuzongyong@linux.alibaba.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index a896ee021e5f..30864848950b 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -171,6 +171,9 @@ struct vdpa_map_file { * @get_vq_num_max: Get the max size of virtqueue * @vdev: vdpa device * Returns u16: max size of virtqueue + * @get_vq_num_min: Get the min size of virtqueue (optional) + * @vdev: vdpa device + * Returns u16: min size of virtqueue * @get_device_id: Get virtio device id * @vdev: vdpa device * Returns u32: virtio device id @@ -266,6 +269,7 @@ struct vdpa_config_ops { void (*set_config_cb)(struct vdpa_device *vdev, struct vdpa_callback *cb); u16 (*get_vq_num_max)(struct vdpa_device *vdev); + u16 (*get_vq_num_min)(struct vdpa_device *vdev); u32 (*get_device_id)(struct vdpa_device *vdev); u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); -- cgit v1.2.3 From d50497eb4e554e1f0351e1836ee7241c059592e6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Oct 2021 15:01:45 +0800 Subject: virtio_config: introduce a new .enable_cbs method This patch introduces a new method to enable the callbacks for config and virtqueues. This will be used for making sure the virtqueue callbacks are only enabled after virtio_device_ready() if the transport implements this method. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20211019070152.8236-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8519b3ae5d52..4d107ad31149 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -23,6 +23,8 @@ struct virtio_shm_region { * any of @get/@set, @get_status/@set_status, or @get_features/ * @finalize_features are NOT safe to be called from an atomic * context. + * @enable_cbs: enable the callbacks + * vdev: the virtio_device * @get: read the value of a configuration field * vdev: the virtio_device * offset: the offset of the configuration field @@ -75,6 +77,7 @@ struct virtio_shm_region { */ typedef void vq_callback_t(struct virtqueue *); struct virtio_config_ops { + void (*enable_cbs)(struct virtio_device *vdev); void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, @@ -229,6 +232,9 @@ void virtio_device_ready(struct virtio_device *dev) { unsigned status = dev->config->get_status(dev); + if (dev->config->enable_cbs) + dev->config->enable_cbs(dev); + BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK); } -- cgit v1.2.3 From 939779f5152d161b34f612af29e7dc1ac4472fcf Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 27 Oct 2021 10:21:04 +0800 Subject: virtio_ring: validate used buffer length This patch validates the used buffer length provided by the device before trying to use it. This is done by recording the in buffer length in a new field in the desc_state structure during virtqueue_add(), so that we can fail virtqueue_get_buf() when we find the device is trying to give us a used buffer length which is greater than the in buffer length. Since some drivers have already done the validation by themselves, this patch makes the core validation optional.
For the driver that doesn't want the validation, it can set the suppress_used_validation to be true (which could be overridden by force_used_validation module parameter). To be more efficient, a dedicate array is used for storing the validate used length, this helps to eliminate the cache stress if validation is done by the driver. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20211027022107.14357-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 41edbc01ffa4..44d0e09da2d9 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,6 +152,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. + * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. @@ -168,6 +169,7 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; + bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); -- cgit v1.2.3 From 6dbb1f1687a2ccdfc5b84b0a35bbc6dfefc4de3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 26 Oct 2021 20:55:12 +0300 Subject: vdpa: Introduce and use vdpa device get, set config helpers Subsequent patches enable get and set configuration either via management device or via vdpa device' config ops. This requires synchronization between multiple callers to get and set config callbacks. Features setting also influence the layout of the configuration fields endianness. To avoid exposing synchronization primitives to callers, introduce helper for setting the configuration and use it. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211026175519.87795-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 30864848950b..267236aab34c 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -386,21 +386,10 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features) return ops->set_features(vdev, features); } -static inline void vdpa_get_config(struct vdpa_device *vdev, - unsigned int offset, void *buf, - unsigned int len) -{ - const struct vdpa_config_ops *ops = vdev->config; - - /* - * Config accesses aren't supposed to trigger before features are set. - * If it does happen we assume a legacy guest. 
- */ - if (!vdev->features_valid) - vdpa_set_features(vdev, 0); - ops->get_config(vdev, offset, buf, len); -} - +void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len); +void vdpa_set_config(struct vdpa_device *dev, unsigned int offset, + const void *buf, unsigned int length); /** * struct vdpa_mgmtdev_ops - vdpa device ops * @dev_add: Add a vdpa device using alloc and register -- cgit v1.2.3 From ad69dd0bf26b88ec6ab26f8bbe5cd74fbed7672a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 26 Oct 2021 20:55:13 +0300 Subject: vdpa: Introduce query of device config layout Introduce a command to query a device config layout. An example query of network vdpa device: $ vdpa dev add name bar mgmtdev vdpasim_net $ vdpa dev config show bar: mac 00:35:09:19:48:05 link up link_announce false mtu 1500 $ vdpa dev config show -jp { "config": { "bar": { "mac": "00:35:09:19:48:05", "link ": "up", "link_announce ": false, "mtu": 1500, } } } Signed-off-by: Parav Pandit Signed-off-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20211026175519.87795-3-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 267236aab34c..5cc5e501397f 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -63,6 +63,7 @@ struct vdpa_mgmt_dev; * @dev: underlying device * @dma_dev: the actual device that is performing DMA * @config: the configuration ops for this device. + * @cf_mutex: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests * @use_va: indicate whether virtual address must be used by this device @@ -74,6 +75,7 @@ struct vdpa_device { struct device dev; struct device *dma_dev; const struct vdpa_config_ops *config; + struct mutex cf_mutex; /* Protects get/set config */ unsigned int index; bool features_valid; bool use_va; -- cgit v1.2.3 From 960deb33be3d08e55a39e40e0286a51c7448e053 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 26 Oct 2021 20:55:14 +0300 Subject: vdpa: Use kernel coding style for structure comments As subsequent patch adds new structure field with comment, move the structure comment to follow kernel coding style. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211026175519.87795-4-parav@nvidia.com Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 5cc5e501397f..fafb7202482c 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -411,10 +411,17 @@ struct vdpa_mgmtdev_ops { void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev); }; +/** + * struct vdpa_mgmt_dev - vdpa management device + * @device: Management parent device + * @ops: operations supported by management device + * @id_table: Pointer to device id table of supported ids + * @list: list entry + */ struct vdpa_mgmt_dev { struct device *device; const struct vdpa_mgmtdev_ops *ops; - const struct virtio_device_id *id_table; /* supported ids */ + const struct virtio_device_id *id_table; struct list_head list; }; -- cgit v1.2.3 From d8ca2fa5be1bdb9d08cfe1f831cddb622a01dfd4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 26 Oct 2021 20:55:15 +0300 Subject: vdpa: Enable user to set mac and mtu of vdpa device $ vdpa dev add name bar mgmtdev vdpasim_net mac 00:11:22:33:44:55 mtu 9000 $ vdpa dev config show bar: mac 00:11:22:33:44:55 link up link_announce false mtu 9000 $ vdpa dev config show -jp { "config": { "bar": { "mac": "00:11:22:33:44:55", "link ": "up", "link_announce ": false, "mtu": 9000, } } } Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20211026175519.87795-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- include/linux/vdpa.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index fafb7202482c..c3011ccda430 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -6,6 +6,8 @@ #include #include #include +#include +#include /** * struct vdpa_calllback - vDPA callback definition. @@ -93,6 +95,14 @@ struct vdpa_iova_range { u64 last; }; +struct vdpa_dev_set_config { + struct { + u8 mac[ETH_ALEN]; + u16 mtu; + } net; + u64 mask; +}; + /** * Corresponding file area for device memory mapping * @file: vma->vm_file for the mapping @@ -397,6 +407,7 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset, * @dev_add: Add a vdpa device using alloc and register * @mdev: parent device to use for device addition * @name: name of the new vdpa device + * @config: config attributes to apply to the device under creation * Driver need to add a new device using _vdpa_register_device() * after fully initializing the vdpa device. Driver must return 0 * on success or appropriate error code. @@ -407,7 +418,8 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset, * _vdpa_unregister_device(). 
*/ struct vdpa_mgmtdev_ops { - int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name); + int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name, + const struct vdpa_dev_set_config *config); void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev); }; @@ -416,12 +428,15 @@ struct vdpa_mgmtdev_ops { * @device: Management parent device * @ops: operations supported by management device * @id_table: Pointer to device id table of supported ids + * @config_attr_mask: bit mask of attributes of type enum vdpa_attr that + * management device support during dev_add callback * @list: list entry */ struct vdpa_mgmt_dev { struct device *device; const struct vdpa_mgmtdev_ops *ops; const struct virtio_device_id *id_table; + u64 config_attr_mask; struct list_head list; }; -- cgit v1.2.3
From 7303524e04af49a47991e19f895c3b8cdc3796c7 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Fri, 29 Oct 2021 22:12:14 +0800 Subject: skmsg: Lose offset info in sk_psock_skb_ingress If sockmap enables strparser, the offset info is lost in sk_psock_skb_ingress(). If the length determined by the parse_msg function is not skb->len, the skb will be converted to sk_msg multiple times, and the userspace app will get the data multiple times. Fix this by getting the offset and length from strp_msg. And as Cong suggested, add one bit in skb->_sk_redir to distinguish whether strparser is enabled or disabled. Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Reviewed-by: Cong Wang Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211029141216.211899-1-liujian56@huawei.com --- include/linux/skmsg.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14ab0c0bc924..94e2a1f6e58d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -508,8 +508,22 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) #if IS_ENABLED(CONFIG_NET_SOCK_MSG) -/* We only have one bit so far. */ -#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) +#define BPF_F_STRPARSER (1UL << 1) + +/* We only have two bits so far. */ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS | BPF_F_STRPARSER) + +static inline bool skb_bpf_strparser(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_STRPARSER; +} + +static inline void skb_bpf_set_strparser(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_STRPARSER; +} static inline bool skb_bpf_ingress(const struct sk_buff *skb) { -- cgit v1.2.3
From 588e5d8766486e52ee332a4bb097b016a355b465 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Fri, 29 Oct 2021 02:39:06 +0000 Subject: cgroup: bpf: Move wrapper for __cgroup_bpf_*() to kernel/bpf/cgroup.c In commit 324bda9e6c5a ("bpf: multi program support for cgroup+bpf"), the cgroup_bpf_*() wrappers were called from kernel/bpf/syscall.c, but now they are only used in kernel/bpf/cgroup.c, so move these functions to kernel/bpf/cgroup.c, like cgroup_bpf_replace().
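For reference, the pattern being consolidated is a double-underscore worker that assumes the lock is held, plus a public wrapper that takes cgroup_mutex around it. Below is a minimal userspace sketch of that pattern using pthreads; the names attach()/__attach() are hypothetical stand-ins, not the kernel's cgroup API:

#include <pthread.h>

static pthread_mutex_t cgroup_mutex = PTHREAD_MUTEX_INITIALIZER;
static int attached_count;      /* state protected by cgroup_mutex */

/* Double-underscore worker: caller must already hold cgroup_mutex. */
static int __attach(int prog_fd)
{
        if (prog_fd < 0)
                return -1;
        attached_count++;
        return 0;
}

/* Public wrapper: takes the lock, delegates, releases the lock. */
static int attach(int prog_fd)
{
        int ret;

        pthread_mutex_lock(&cgroup_mutex);
        ret = __attach(prog_fd);
        pthread_mutex_unlock(&cgroup_mutex);
        return ret;
}

Moving the wrappers next to their workers keeps this locking rule visible in a single file.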
Signed-off-by: He Fengqing Signed-off-by: Tejun Heo --- include/linux/bpf-cgroup.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2746fd804216..9aad4e3ca29b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -157,26 +157,6 @@ struct cgroup_bpf { int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); -int __cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags); -int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type); -int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); - -/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags); -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type); -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr); - int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype); -- cgit v1.2.3
From ebe4560ed5c8cbfe3759f16c23ca5a6df090c6b5 Mon Sep 17 00:00:00 2001 From: Oscar Carter Date: Sat, 30 May 2020 11:08:39 +0200 Subject: firewire: Remove function callback casts In the 1394 OHCI specification, the Isochronous Receive DMA context has several modes. One of the modes is 'BufferFill', and the Linux FireWire stack uses it to receive isochronous packets for multiple isochronous channels as FW_ISO_CONTEXT_RECEIVE_MULTICHANNEL. The mode is not used by any in-kernel driver, but it is available to userspace. The character device driver in firewire-core includes a function callback cast for this mode, since the callback type differs from that of the other modes. This cast is an obstacle for Control Flow Integrity builds because it triggers a -Wcast-function-type warning. This commit removes the cast. A static helper function is added to initialize an isochronous context for the mode. The helper assigns the mode-specific callback after calling the existing kernel API. Notably, the isochronous channel number, speed, and header size are not required for this mode. The character device driver now uses the helper for this mode instead of calling the kernel API directly. The same goal can be achieved (in the ioctl_create_iso_context function) without this helper function as follows: - Call the fw_iso_context_create function, passing NULL as the callback parameter. - Then set the context->callback.sc or context->callback.mc variable based on the a->type value. However, using the helper function created in this patch makes the code clearer and more declarative, and avoids calling a function with one purpose to achieve another. Co-developed-by: Takashi Sakamoto Signed-off-by: Takashi Sakamoto Co-developed-by: Stefan Richter Signed-off-by: Stefan Richter Signed-off-by: Oscar Carter Reviewed-by: Takashi Sakamoto Tested-by: Takashi Sakamoto Signed-off-by: Gustavo A. R.
Silva --- include/linux/firewire.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index aec8f30ab200..07967a450eaa 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -436,6 +436,12 @@ typedef void (*fw_iso_callback_t)(struct fw_iso_context *context, void *header, void *data); typedef void (*fw_iso_mc_callback_t)(struct fw_iso_context *context, dma_addr_t completed, void *data); + +union fw_iso_callback { + fw_iso_callback_t sc; + fw_iso_mc_callback_t mc; +}; + struct fw_iso_context { struct fw_card *card; int type; @@ -443,10 +449,7 @@ struct fw_iso_context { int speed; bool drop_overflow_headers; size_t header_size; - union { - fw_iso_callback_t sc; - fw_iso_mc_callback_t mc; - } callback; + union fw_iso_callback callback; void *callback_data; }; -- cgit v1.2.3 From 74128d801b5165650ae6222e8cf23780fbc55e67 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 23 Sep 2021 01:09:45 +0200 Subject: watchdog: ux500_wdt: Drop platform data Drop the platform data passing from the PRCMU driver. This platform data was part of the ambition to support more SoCs, which in turn were never mass produced. Only a name remains of the MFD cell so switch to MFD_CELL_NAME(). Cc: Lee Jones Signed-off-by: Linus Walleij Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20210922230947.1864357-1-linus.walleij@linaro.org Acked-by: Lee Jones Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/platform_data/ux500_wdt.h | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 include/linux/platform_data/ux500_wdt.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ux500_wdt.h b/include/linux/platform_data/ux500_wdt.h deleted file mode 100644 index de6a4ad41e76..000000000000 --- a/include/linux/platform_data/ux500_wdt.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST Ericsson SA 2011 - * - * STE Ux500 Watchdog platform data - */ -#ifndef __UX500_WDT_H -#define __UX500_WDT_H - -/** - * struct ux500_wdt_data - */ -struct ux500_wdt_data { - unsigned int timeout; - bool has_28_bits_resolution; -}; - -#endif /* __UX500_WDT_H */ -- cgit v1.2.3 From 31a645aea4f8da5bb190ce322c6e5aacaef13855 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 25 Oct 2021 14:40:22 +0800 Subject: bpf: Factor out a helper to prepare trampoline for struct_ops prog Factor out a helper bpf_struct_ops_prepare_trampoline() to prepare trampoline for BPF_PROG_TYPE_STRUCT_OPS prog. It will be used by .test_run callback in following patch. 
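To give a rough intuition for what "preparing a trampoline" involves, here is a hedged userspace sketch: a struct_ops slot expects a native C function signature, while a BPF program exposes one generic entry point taking an argument array, so a small adapter bridges the two. All names below (tcp_ops_like, generic_prog_run, init_trampoline) are invented for illustration; the real helper emits such an adapter into an executable image at runtime rather than at compile time.

/* A struct_ops-style table with a native function-pointer slot. */
struct tcp_ops_like {
        int (*init)(void *sk);
};

/* Stand-in for a BPF program's single generic entry point. */
static int generic_prog_run(unsigned long *args)
{
        return args[0] ? 0 : -1;
}

/* The "trampoline": packs native arguments and calls the generic entry. */
static int init_trampoline(void *sk)
{
        unsigned long args[1] = { (unsigned long)sk };

        return generic_prog_run(args);
}

static const struct tcp_ops_like ops = { .init = init_trampoline };
/* e.g. ops.init(sk) now transparently runs the program */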
Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211025064025.2567443-2-houtao1@huawei.com --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6deebf8bf78f..aabd3540aaaf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1000,6 +1000,10 @@ bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, + struct bpf_prog *prog, + const struct btf_func_model *model, + void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { if (owner == BPF_MODULE_OWNER) -- cgit v1.2.3
From 35346ab64132d0f5919b06932d708c0d10360553 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 25 Oct 2021 14:40:23 +0800 Subject: bpf: Factor out helpers for ctx access checking Factor out two helpers to check read access to ctx for raw tracepoints and BTF functions. bpf_tracing_ctx_access() checks that the read access to an argument is valid, and bpf_tracing_btf_ctx_access() additionally checks that the BTF type of the argument is valid. bpf_tracing_btf_ctx_access() will be used by the following patch. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211025064025.2567443-3-houtao1@huawei.com --- include/linux/bpf.h | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aabd3540aaaf..67f71e7def56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1650,6 +1650,29 @@ bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); + +static inline bool bpf_tracing_ctx_access(int off, int size, + enum bpf_access_type type) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static inline bool bpf_tracing_btf_ctx_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_tracing_ctx_access(off, size, type)) + return false; + return btf_ctx_access(off, size, type, prog, info); +} + int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, -- cgit v1.2.3
From c196906d50e360d82ed9aa5596a9d0ce89b7ab78 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 25 Oct 2021 14:40:24 +0800 Subject: bpf: Add dummy BPF STRUCT_OPS for test purpose Currently the test of BPF STRUCT_OPS depends on the specific bpf implementation of tcp_congestion_ops, but it cannot cover all basic functionality (e.g., return value handling), so introduce a dummy BPF STRUCT_OPS for test purposes. Loading a bpf_dummy_ops implementation from userspace is prohibited, and its only purpose is to run a BPF_PROG_TYPE_STRUCT_OPS program through bpf(BPF_PROG_TEST_RUN). Now programs for test_1() & test_2() are supported.
The following three cases are exercised in bpf_dummy_struct_ops_test_run(): (1) Test and check the value returned through the state arg in test_1(state). The content of state is copied from a userspace pointer and copied back after calling test_1(state). The user pointer is saved in a u64 array and the array address is passed through ctx_in. (2) Test and check the return value of test_1(NULL). This simulates the case in which an invalid input argument is passed in. (3) Test passing multiple arguments in test_2(state, ...). 5 arguments are passed through ctx_in in the form of a u64 array. The first element of the array is the userspace pointer to state and the other 4 arguments follow. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211025064025.2567443-4-houtao1@huawei.com --- include/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67f71e7def56..c098089c1b54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1018,6 +1018,22 @@ static inline void bpf_module_put(const void *data, struct module *owner) else module_put(owner); } + +#ifdef CONFIG_NET +/* Define it here to avoid the use of forward declaration */ +struct bpf_dummy_ops_state { + int val; +}; + +struct bpf_dummy_ops { + int (*test_1)(struct bpf_dummy_ops_state *cb); + int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2, + char a3, unsigned long a4); +}; + +int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +#endif #else static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id) { -- cgit v1.2.3
From 8845b4681bf44b9d2d2badf2c67cf476e42a86bd Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 29 Oct 2021 15:49:08 -0700 Subject: bpf: Add alignment padding for "map_extra" + consolidate holes This patch makes 2 changes regarding alignment padding for the "map_extra" field. 1) In the kernel header, "map_extra" and "btf_value_type_id" are rearranged to consolidate the hole. Before: struct bpf_map { ... u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ u64 map_extra; /* 48 8 */ int spin_lock_off; /* 56 4 */ int timer_off; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 id; /* 64 4 */ int numa_node; /* 68 4 */ ... bool frozen; /* 117 1 */ /* XXX 10 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ ... struct work_struct work; /* 144 72 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct mutex freeze_mutex; /* 216 144 */ /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */ u64 writecnt; /* 360 8 */ /* size: 384, cachelines: 6, members: 26 */ /* sum members: 354, holes: 2, sum holes: 14 */ /* padding: 16 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 10 */ } __attribute__((__aligned__(64))); After: struct bpf_map { ... u32 max_entries; /* 36 4 */ u64 map_extra; /* 40 8 */ u32 map_flags; /* 48 4 */ int spin_lock_off; /* 52 4 */ int timer_off; /* 56 4 */ u32 id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ int numa_node; /* 64 4 */ ... bool frozen /* 113 1 */ /* XXX 14 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ ...
struct work_struct work; /* 144 72 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct mutex freeze_mutex; /* 216 144 */ /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */ u64 writecnt; /* 360 8 */ /* size: 384, cachelines: 6, members: 26 */ /* sum members: 354, holes: 1, sum holes: 14 */ /* padding: 16 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 14 */ } __attribute__((__aligned__(64))); 2) Add alignment padding to the bpf_map_info struct More details can be found in commit 36f9814a494a ("bpf: fix uapi hole for 32 bit compat applications") Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211029224909.1721024-3-joannekoong@fb.com --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c098089c1b54..f6743d4bb531 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -168,23 +168,23 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; - u32 map_flags; u64 map_extra; /* any per-map-type extra fields */ + u32 map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; + u32 btf_vmlinux_value_type_id; struct btf *btf; #ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif char name[BPF_OBJ_NAME_LEN]; - u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ - /* 22 bytes hole */ + /* 14 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. -- cgit v1.2.3 From e66435936756d9bce96433be183358a8994a0f0d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 1 Nov 2021 14:56:37 -0700 Subject: mm: fix mismerge of folio page flag manipulators I had missed a semantic conflict between commit d389a4a81155 ("mm: Add folio flag manipulation functions") from the folio tree, and commit eac96c3efdb5 ("mm: filemap: check if THP has hwpoisoned subpage for PMD page fault") that added a new set of page flags. My build tests had too many options enabled, which hid this issue. But if you didn't have MEMORY_FAILURE or TRANSPARENT_HUGEPAGE enabled, you'd end up with build errors like this: include/linux/page-flags.h:806:29: error: macro "PAGEFLAG_FALSE" requires 2 arguments, but only 1 given 806 | PAGEFLAG_FALSE(HasHWPoisoned) | ^ due to the missing lowercase name used for folio function naming. 
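The arity problem is easy to see in a stripped-down stand-in for the page-flags stub generator (a simplified sketch, not the real macro): the second, lowercase argument is what produces the folio_test_* name, so a one-argument call cannot expand.

struct page;
struct folio;

#define PAGEFLAG_FALSE(uname, lname)                                    \
static inline int Page##uname(const struct page *page)                 \
{ return 0; }                                                          \
static inline int folio_test_##lname(const struct folio *folio)        \
{ return 0; }

PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)  /* OK: both names given */
/* PAGEFLAG_FALSE(HasHWPoisoned) fails: the macro requires 2 arguments */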
Fixes: 49f8275c7d92 ("Merge tag 'folio-5.16' of git://git.infradead.org/users/willy/pagecache") Reported-by: Naresh Kamboju Reported-by: Yang Shi Cc: Matthew Wilcox Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d8623d6e1141..981341a3c3c4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -803,8 +803,8 @@ PAGEFLAG_FALSE(DoubleMap, double_map) PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) #else -PAGEFLAG_FALSE(HasHWPoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned) +PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) + TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* -- cgit v1.2.3 From f1a456f8f3fc5828d8abcad941860380ae147b1d Mon Sep 17 00:00:00 2001 From: Talal Ahmad Date: Fri, 29 Oct 2021 22:05:42 -0400 Subject: net: avoid double accounting for pure zerocopy skbs Track skbs with only zerocopy data and avoid charging them to kernel memory to correctly account the memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages which are already accounted to user. Before this change, they are charged again in kernel in __zerocopy_sg_from_iter. The charging in kernel is excessive because data is not being copied into skb frags. This excessive charging can lead to kernel going into memory pressure state which impacts all sockets in the system adversely. Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove charge/uncharge for data in such skbs. Initially, an skb is marked pure zerocopy when it is empty and in zerocopy path. skb can then change from a pure zerocopy skb to mixed data skb (zerocopy and copy data) if it is at tail of write queue and there is room available in it and non-zerocopy data is being sent in the next sendmsg call. At this time sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is unmarked. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY. A pure zerocopy skb can later be coalesced into normal skb if they are next to each other in queue but this patch prevents coalescing from happening. This avoids complexity of charging when skb downgrades from pure zerocopy to mixed. This is also rare. In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge for SKB_TRUESIZE(MAX_TCP_HEADER) is done for sk_mem_charge in tcp_skb_entail for an skb without data. Testing with the msg_zerocopy.c benchmark between two hosts(100G nics) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 that tracks sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this change it is 0. This is due to no charge to sk_forward_alloc for zerocopy data and shows memory utilization for kernel is lowered. 
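As a hedged model of the accounting rule (simplified stand-in types, not the kernel's sk_buff): for a pure zerocopy segment only the header-backed portion of truesize would be charged to the socket, since the payload lives in user pages that are already accounted to the user.

#define SKBFL_PURE_ZEROCOPY (1u << 2)

struct fake_skb {
        unsigned int flags;
        unsigned int truesize;          /* header plus data memory */
        unsigned int header_truesize;   /* portion not backed by user pages */
};

/* How much to charge the socket for this segment. */
static unsigned int charge_for(const struct fake_skb *skb)
{
        if (skb->flags & SKBFL_PURE_ZEROCOPY)
                return skb->header_truesize;  /* user pages already accounted */
        return skb->truesize;
}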
Signed-off-by: Talal Ahmad Acked-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6..10869906cc57 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } -- cgit v1.2.3
From fcdb44d08a95003c3d040aecdee286156ec6f34e Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 1 Nov 2021 10:36:28 -0700 Subject: net: arp: introduce arp_evict_nocarrier sysctl parameter This change introduces a new sysctl parameter, arp_evict_nocarrier. When set (default) the ARP cache will be cleared on a NOCARRIER event. This new option has been defaulted to '1' which maintains existing behavior. Clearing the ARP cache on NOCARRIER is relatively new, introduced by: commit 859bd2ef1fc1110a8031b967ee656c53a6260a76 Author: David Ahern Date: Thu Oct 11 20:33:49 2018 -0700 net: Evict neighbor entries on carrier down The reason for this change is to prevent the ARP cache from being cleared when a wireless device roams. Specifically for wireless roams the ARP cache should not be cleared because the underlying network has not changed. Clearing the ARP cache in this case can introduce significant delays sending out packets after a roam. A user reported such a situation here: https://lore.kernel.org/linux-wireless/CACsRnHWa47zpx3D1oDq9JYnZWniS8yBwW1h0WAVZ6vrbwL_S0w@mail.gmail.com/ After some investigation it was found that the kernel was holding onto packets until ARP finished, which resulted in this 1 second delay. It was also found that the first ARP who-has was never responded to, which is actually what causes the delay. This change is more or less working around this behavior, but again, there is no reason to clear the cache on a roam anyway. As for the unanswered who-has, we know the packet made it OTA since it was seen while monitoring. Why it never received a response is unknown. In any case, since this is a problem on the AP side of things, all that can be done is to work around it until it is solved.
Some background on testing/reproducing the packet delay: Hardware: - 2 access points configured for Fast BSS Transition (Though I don't see why regular reassociation wouldn't have the same behavior) - Wireless station running IWD as supplicant - A device on network able to respond to pings (I used one of the APs) Procedure: - Connect to first AP - Ping once to establish an ARP entry - Start a tcpdump - Roam to second AP - Wait for operstate UP event, and note the timestamp - Start pinging Results: Below is the tcpdump after UP. It was recorded the interface went UP at 10:42:01.432875. 10:42:01.461871 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28 10:42:02.497976 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28 10:42:02.507162 ARP, Reply 192.168.254.1 is-at ac:86:74:55:b0:20, length 46 10:42:02.507185 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 1, length 64 10:42:02.507205 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 2, length 64 10:42:02.507212 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 3, length 64 10:42:02.507219 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 4, length 64 10:42:02.507225 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 5, length 64 10:42:02.507232 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 6, length 64 10:42:02.515373 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 1, length 64 10:42:02.521399 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 2, length 64 10:42:02.521612 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 3, length 64 10:42:02.521941 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 4, length 64 10:42:02.522419 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 5, length 64 10:42:02.523085 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 6, length 64 You can see the first ARP who-has went out very quickly after UP, but was never responded to. Nearly a second later the kernel retries and gets a response. Only then do the ping packets go out. If an ARP entry is manually added prior to UP (after the cache is cleared) it is seen that the first ping is never responded to, so its not only an issue with ARP but with data packets in general. As mentioned prior, the wireless interface was also monitored to verify the ping/ARP packet made it OTA which was observed to be true. 
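A hypothetical usage sketch of the new knob in C: a supplicant could clear it for the wireless interface before a roam and restore it afterwards. The procfs path is assumed to follow the usual per-device net.ipv4.conf layout, and the interface name is illustrative.

#include <stdio.h>

static int set_arp_evict_nocarrier(const char *ifname, int val)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/proc/sys/net/ipv4/conf/%s/arp_evict_nocarrier", ifname);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d\n", val);
        fclose(f);
        return 0;
}

/* e.g. set_arp_evict_nocarrier("wlan0", 0); before initiating the roam */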
Signed-off-by: James Prestwood Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index a038feb63f23..518b484a7f07 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -133,6 +133,8 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) #define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) +#define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \ + ARP_EVICT_NOCARRIER) struct in_ifaddr { struct hlist_node hash; -- cgit v1.2.3 From 18ac597af25e9760b76471524096f5b29eb820e6 Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 1 Nov 2021 10:36:29 -0700 Subject: net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter In most situations the neighbor discovery cache should be cleared on a NOCARRIER event which is currently done unconditionally. But for wireless roams the neighbor discovery cache can and should remain intact since the underlying network has not changed. This patch introduces a sysctl option ndisc_evict_nocarrier which can be disabled by a wireless supplicant during a roam. This allows packets to be sent after a roam immediately without having to wait for neighbor discovery. A user reported roughly a 1 second delay after a roam before packets could be sent out (note, on IPv4). This delay was due to the ARP cache being cleared. During testing of this same scenario using IPv6 no delay was noticed, but regardless there is no reason to clear the ndisc cache for wireless roams. Signed-off-by: James Prestwood Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c383630d3f06..20c1f968da7c 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -79,6 +79,7 @@ struct ipv6_devconf { __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; + __u8 ndisc_evict_nocarrier; struct ctl_table_header *sysctl_header; }; -- cgit v1.2.3 From 4a08e3271c55f8b5d56906a8aa5bd041911cf897 Mon Sep 17 00:00:00 2001 From: "Hector.Yuan" Date: Mon, 4 Oct 2021 22:42:33 +0800 Subject: cpufreq: Fix parameter in parse_perf_domain() Pass cpu to parse_perf_domain() instead of pcpu. Fixes: 8486a32dd484 ("cpufreq: Add of_perf_domain_get_sharing_cpumask") Signed-off-by: Hector.Yuan [ Viresh: Massaged changelog ] Signed-off-by: Viresh Kumar --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ff88bb3e44fc..66a1f495f01a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1041,7 +1041,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ if (cpu == pcpu) continue; - ret = parse_perf_domain(pcpu, list_name, cell_name); + ret = parse_perf_domain(cpu, list_name, cell_name); if (ret < 0) continue; -- cgit v1.2.3 From 84882cf72cd774cf16fd338bdbf00f69ac9f9194 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Nov 2021 22:26:08 -0700 Subject: Revert "net: avoid double accounting for pure zerocopy skbs" This reverts commit f1a456f8f3fc5828d8abcad941860380ae147b1d. 
WARNING: CPU: 1 PID: 6819 at net/core/skbuff.c:5429 skb_try_coalesce+0x78b/0x7e0 CPU: 1 PID: 6819 Comm: xxxxxxx Kdump: loaded Tainted: G S 5.15.0-04194-gd852503f7711 #16 RIP: 0010:skb_try_coalesce+0x78b/0x7e0 Code: e8 2a bf 41 ff 44 8b b3 bc 00 00 00 48 8b 7c 24 30 e8 19 c0 41 ff 44 89 f0 48 03 83 c0 00 00 00 48 89 44 24 40 e9 47 fb ff ff <0f> 0b e9 ca fc ff ff 4c 8d 70 ff 48 83 c0 07 48 89 44 24 38 e9 61 RSP: 0018:ffff88881f449688 EFLAGS: 00010282 RAX: 00000000fffffe96 RBX: ffff8881566e4460 RCX: ffffffff82079f7e RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8881566e47b0 RBP: ffff8881566e46e0 R08: ffffed102619235d R09: ffffed102619235d R10: ffff888130c91ae3 R11: ffffed102619235c R12: ffff88881f4498a0 R13: 0000000000000056 R14: 0000000000000009 R15: ffff888130c91ac0 FS: 00007fec2cbb9700(0000) GS:ffff88881f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fec1b060d80 CR3: 00000003acf94005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_try_coalesce+0xeb/0x290 ? tcp_parse_options+0x610/0x610 ? mark_held_locks+0x79/0xa0 tcp_queue_rcv+0x69/0x2f0 tcp_rcv_established+0xa49/0xd40 ? tcp_data_queue+0x18a0/0x18a0 tcp_v6_do_rcv+0x1c9/0x880 ? rt6_mtu_change_route+0x100/0x100 tcp_v6_rcv+0x1624/0x1830 Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 10869906cc57..0bd6520329f6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,15 +454,9 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), - - /* segment contains only zerocopy data and should not be - * charged to the kernel memory. - */ - SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) -#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1470,17 +1464,6 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } -static inline bool skb_zcopy_pure(const struct sk_buff *skb) -{ - return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; -} - -static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, - const struct sk_buff *skb2) -{ - return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); -} - static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1545,7 +1528,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; + skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; } } -- cgit v1.2.3 From ca7752caeaa70bd31d1714af566c9809688544af Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 1 Nov 2021 17:06:15 -0400 Subject: posix-cpu-timers: Clear task::posix_cputimers_work in copy_process() copy_process currently copies task_struct.posix_cputimers_work as-is. If a timer interrupt arrives while handling clone and before dup_task_struct completes then the child task will have: 1. posix_cputimers_work.scheduled = true 2. posix_cputimers_work.work queued. 
copy_process clears task_struct.task_works, so (2) will have no effect and posix_cpu_timers_work will never run (not to mention it doesn't make sense for two tasks to share a common linked list). Since posix_cpu_timers_work never runs, posix_cputimers_work.scheduled is never cleared. Since scheduled is set, future timer interrupts will skip scheduling work, with the ultimate result that the task will never receive timer expirations. Together, the complete flow is: 1. Task 1 calls clone(), enters kernel. 2. Timer interrupt fires, schedules task work on Task 1. 2a. task_struct.posix_cputimers_work.scheduled = true 2b. task_struct.posix_cputimers_work.work added to task_struct.task_works. 3. dup_task_struct() copies Task 1 to Task 2. 4. copy_process() clears task_struct.task_works for Task 2. 5. Future timer interrupts on Task 2 see task_struct.posix_cputimers_work.scheduled = true and skip scheduling work. Fix this by explicitly clearing contents of task_struct.posix_cputimers_work in copy_process(). This was never meant to be shared or inherited across tasks in the first place. Fixes: 1fb497dd0030 ("posix-cpu-timers: Provide mechanisms to defer timer handling to task_work") Reported-by: Rhys Hiltner Signed-off-by: Michael Pratt Signed-off-by: Thomas Gleixner Cc: Link: https://lore.kernel.org/r/20211101210615.716522-1-mpratt@google.com --- include/linux/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 00fef0064355..5bbcd280bfd2 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -184,8 +184,10 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct, #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +void clear_posix_cputimers_work(struct task_struct *p); void posix_cputimers_init_work(void); #else +static inline void clear_posix_cputimers_work(struct task_struct *p) { } static inline void posix_cputimers_init_work(void) { } #endif -- cgit v1.2.3 From 8791545eda52e8f3bc48e3cd902e38bf4ba4c9de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Oct 2021 16:17:03 -0400 Subject: NFS: Move NFS protocol display macros to global header Refactor: surface useful show_ macros so they can be shared between the client and server trace code. Additional clean up: - Housekeeping: ensure the correct #include files are pulled in and add proper TRACE_DEFINE_ENUM where they are missing - Use a consistent naming scheme for the helpers - Store values to be displayed symbolically as unsigned long, as that is the type that the __print_yada() functions take Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 15004c469807..5662d8be04eb 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -292,6 +292,10 @@ enum nfsstat4 { NFS4ERR_XATTR2BIG = 10096, }; +/* error codes for internal client use */ +#define NFS4ERR_RESET_TO_MDS 12001 +#define NFS4ERR_RESET_TO_PNFS 12002 + static inline bool seqid_mutating_err(u32 err) { /* See RFC 7530, section 9.1.7 */ -- cgit v1.2.3 From 5fe11512cdc24ccc66ac5da3c815ac9e59449abc Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:33:13 -0700 Subject: Input: remove unused header Commit 83b41248ed04 ("Input: cy8ctmg110_ts - switch to using gpiod API") remove the last use of but left the header file behind. Nothing uses it now, delete it. 
Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20211102220203.940290-6-corbet@lwn.net Signed-off-by: Dmitry Torokhov --- include/linux/input/cy8ctmg110_pdata.h | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 include/linux/input/cy8ctmg110_pdata.h (limited to 'include/linux') diff --git a/include/linux/input/cy8ctmg110_pdata.h b/include/linux/input/cy8ctmg110_pdata.h deleted file mode 100644 index ee1d44545f30..000000000000 --- a/include/linux/input/cy8ctmg110_pdata.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CY8CTMG110_PDATA_H -#define _LINUX_CY8CTMG110_PDATA_H - -struct cy8ctmg110_pdata -{ - int reset_pin; /* Reset pin is wired to this GPIO (optional) */ -}; - -#endif -- cgit v1.2.3 From c4777efa751d293e369aec464ce6875e957be255 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Nov 2021 17:45:55 -0700 Subject: net: add and use skb_unclone_keeptruesize() helper While commit 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") fixed immediate issues found when KFENCE was enabled/tested, there are still similar issues, when tcp_trim_head() hits KFENCE while the master skb is cloned. This happens under heavy networking TX workloads, when the TX completion might be delayed after incoming ACK. This patch fixes the WARNING in sk_stream_kill_queues when sk->sk_mem_queued/sk->sk_forward_alloc are not zero. Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Eric Dumazet Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211102004555.1359210-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0bd6520329f6..a63e13082397 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1675,6 +1675,22 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } +/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + might_sleep_if(gfpflags_allow_blocking(pri)); + + if (skb_cloned(skb)) { + unsigned int save = skb->truesize; + int res; + + res = pskb_expand_head(skb, 0, 0, pri); + skb->truesize = save; + return res; + } + return 0; +} + /** * skb_header_cloned - is the header a clone * @skb: buffer to check -- cgit v1.2.3 From 0dc54bd4d6e03be1f0b678c4297170b79f1a44ab Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 3 Nov 2021 17:34:05 +0900 Subject: fscache_cookie_enabled: check cookie is valid before accessing it fscache_cookie_enabled() could be called on NULL cookies and cause a null pointer dereference when accessing cookie flags: just make sure the cookie is valid first Suggested-by: David Howells Acked-by: David Howells Signed-off-by: Dominique Martinet --- include/linux/fscache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index a4dab5998613..3b2282c157f7 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -167,7 +167,7 @@ struct fscache_cookie { static inline bool fscache_cookie_enabled(struct fscache_cookie *cookie) { - return test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); + return fscache_cookie_valid(cookie) && test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } /* -- cgit v1.2.3 From 
c081d53f97a1a90a38e4296dd3d6fda5e38dca2c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:47 -0400 Subject: security: pass asoc to sctp_assoc_request and sctp_sk_clone This patch is to move secid and peer_secid from endpoint to association, and pass asoc to sctp_assoc_request and sctp_sk_clone instead of ep. As ep is the local endpoint and asoc represents a connection, and in SCTP one sk/ep could have multiple asoc/connection, saving secid/peer_secid for new asoc will overwrite the old asoc's. Note that since asoc can be passed as NULL, security_sctp_assoc_request() is moved to the place right after the new_asoc is created in sctp_sf_do_5_1B_init() and sctp_sf_do_unexpected_init(). v1->v2: - fix the description of selinux_netlbl_skbuff_setsid(), as Jakub noticed. - fix the annotation in selinux_sctp_assoc_request(), as Richard Noticed. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 8 ++++---- include/linux/security.h | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index a9ac70ae01ab..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -329,11 +329,11 @@ LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) LSM_HOOK(int, 0, tun_dev_open, void *security) -LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, +LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc, struct sk_buff *skb) LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) -LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, +LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) #endif /* CONFIG_SECURITY_NETWORK */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0bada4df23fc..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1027,9 +1027,9 @@ * Security hooks for SCTP * * @sctp_assoc_request: - * Passes the @ep and @chunk->skb of the association INIT packet to + * Passes the @asoc and @chunk->skb of the association INIT packet to * the security module. - * @ep pointer to sctp endpoint structure. + * @asoc pointer to sctp association structure. * @skb pointer to skbuff of association packet. * Return 0 on success, error on failure. * @sctp_bind_connect: @@ -1047,9 +1047,9 @@ * Called whenever a new socket is created by accept(2) (i.e. a TCP * style socket) or when a socket is 'peeled off' e.g userspace * calls sctp_peeloff(3). - * @ep pointer to current sctp endpoint structure. + * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. - * @sk pointer to new sock structure. + * @newsk pointer to new sock structure. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 7e0ba63b5dde..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -179,7 +179,7 @@ struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; struct seq_file; -struct sctp_endpoint; +struct sctp_association; #ifdef CONFIG_MMU extern unsigned long mmap_min_addr; @@ -1425,10 +1425,10 @@ int security_tun_dev_create(void); int security_tun_dev_attach_queue(void *security); int security_tun_dev_attach(struct sock *sk, void *security); int security_tun_dev_open(void *security); -int security_sctp_assoc_request(struct sctp_endpoint *ep, struct sk_buff *skb); +int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb); int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); -void security_sctp_sk_clone(struct sctp_endpoint *ep, struct sock *sk, +void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); #else /* CONFIG_SECURITY_NETWORK */ @@ -1631,7 +1631,7 @@ static inline int security_tun_dev_open(void *security) return 0; } -static inline int security_sctp_assoc_request(struct sctp_endpoint *ep, +static inline int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return 0; @@ -1644,7 +1644,7 @@ static inline int security_sctp_bind_connect(struct sock *sk, int optname, return 0; } -static inline void security_sctp_sk_clone(struct sctp_endpoint *ep, +static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { -- cgit v1.2.3 From 7c2ef0240e6abfd3cc59511339517358350a8910 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 2 Nov 2021 08:02:49 -0400 Subject: security: add sctp_assoc_established hook security_sctp_assoc_established() is added to replace security_inet_conn_established() called in sctp_sf_do_5_1E_ca(), so that asoc can be accessed in security subsystem and save the peer secid to asoc->peer_secid. v1->v2: - fix the return value of security_sctp_assoc_established() in security.h, found by kernel test robot and Ondrej. Fixes: 72e89f50084c ("security: Add support for SCTP security hooks") Reported-by: Prashanth Prahlad Reviewed-by: Richard Haines Tested-by: Richard Haines Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/lsm_hooks.h | 5 +++++ include/linux/security.h | 7 +++++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..442a611fa0fb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,6 +335,8 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) +LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, + struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..d6823214d5c1 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,6 +1050,11 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. 
+ * @sctp_assoc_established: + * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet + * to the security module. + * @asoc pointer to sctp association structure. + * @skb pointer to skbuff of association packet. * * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..06eac4e61a13 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,6 +1430,8 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); +void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1649,6 +1651,11 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } + +static inline void security_sctp_assoc_established(struct sctp_association *asoc, + struct sk_buff *skb) +{ +} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3 From 9b65b17db72313b7a4fe9bc9502928c88be57986 Mon Sep 17 00:00:00 2001 From: Talal Ahmad Date: Tue, 2 Nov 2021 22:58:44 -0400 Subject: net: avoid double accounting for pure zerocopy skbs Track skbs containing only zerocopy data and avoid charging them to kernel memory to correctly account the memory utilization for msg_zerocopy. All of the data in such skbs is held in user pages which are already accounted to user. Before this change, they are charged again in kernel in __zerocopy_sg_from_iter. The charging in kernel is excessive because data is not being copied into skb frags. This excessive charging can lead to kernel going into memory pressure state which impacts all sockets in the system adversely. Mark pure zerocopy skbs with a SKBFL_PURE_ZEROCOPY flag and remove charge/uncharge for data in such skbs. Initially, an skb is marked pure zerocopy when it is empty and in zerocopy path. skb can then change from a pure zerocopy skb to mixed data skb (zerocopy and copy data) if it is at tail of write queue and there is room available in it and non-zerocopy data is being sent in the next sendmsg call. At this time sk_mem_charge is done for the pure zerocopied data and the pure zerocopy flag is unmarked. We found that this happens very rarely on workloads that pass MSG_ZEROCOPY. A pure zerocopy skb can later be coalesced into normal skb if they are next to each other in queue but this patch prevents coalescing from happening. This avoids complexity of charging when skb downgrades from pure zerocopy to mixed. This is also rare. In sk_wmem_free_skb, if it is a pure zerocopy skb, an sk_mem_uncharge for SKB_TRUESIZE(skb_end_offset(skb)) is done for sk_mem_charge in tcp_skb_entail for an skb without data. Testing with the msg_zerocopy.c benchmark between two hosts(100G nics) with zerocopy showed that before this patch the 'sock' variable in memory.stat for cgroup2 that tracks sum of sk_forward_alloc, sk_rmem_alloc and sk_wmem_queued is around 1822720 and with this change it is 0. This is due to no charge to sk_forward_alloc for zerocopy data and shows memory utilization for kernel is lowered. With this commit we don't see the warning we saw in previous commit which resulted in commit 84882cf72cd774cf16fd338bdbf00f69ac9f9194. 
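A minimal sketch of the coalescing guard, with simplified stand-in types: two segments are only allowed to merge when both are pure zerocopy or both are not, which sidesteps the charging complexity (and the warning behind the earlier revert) of mixing the two kinds in one skb.

#include <stdbool.h>

#define SKBFL_PURE_ZEROCOPY (1u << 2)

struct fake_skb {
        unsigned int flags;
};

static bool zcopy_pure(const struct fake_skb *skb)
{
        return skb->flags & SKBFL_PURE_ZEROCOPY;
}

/* Mirror of the skb_pure_zcopy_same() check done before coalescing. */
static bool can_coalesce(const struct fake_skb *to, const struct fake_skb *from)
{
        return zcopy_pure(to) == zcopy_pure(from);
}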
Signed-off-by: Talal Ahmad Acked-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a63e13082397..686a666d073d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -454,9 +454,15 @@ enum { * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), + + /* segment contains only zerocopy data and should not be + * charged to the kernel memory. + */ + SKBFL_PURE_ZEROCOPY = BIT(2), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) +#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) /* * The callback notifies userspace to release buffers when skb DMA is done in @@ -1464,6 +1470,17 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) return is_zcopy ? skb_uarg(skb) : NULL; } +static inline bool skb_zcopy_pure(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; +} + +static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ + return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); +} + static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); @@ -1528,7 +1545,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); - skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG; + skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } -- cgit v1.2.3 From 1aabe578dd86e9f2867c4db4fba9a15f4ba1825d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Nov 2021 15:02:36 -0700 Subject: ethtool: fix ethtool msg len calculation for pause stats ETHTOOL_A_PAUSE_STAT_MAX is the MAX attribute id, so we need to subtract non-stats and add one to get a count (IOW -2+1 == -1). Otherwise we'll see: ethnl cmd 21: calculated reply length 40, but consumed 52 Fixes: 9a27a33027f2 ("ethtool: add standard pause stats") Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/ethtool_netlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 1e7bf78cb382..aba348d58ff6 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -10,6 +10,9 @@ #define __ETHTOOL_LINK_MODE_MASK_NWORDS \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) +#define ETHTOOL_PAUSE_STAT_CNT (__ETHTOOL_A_PAUSE_STAT_CNT - \ + ETHTOOL_A_PAUSE_STAT_TX_FRAMES) + enum ethtool_multicast_groups { ETHNL_MCGRP_MONITOR, }; -- cgit v1.2.3 From 4330fe35b8213e92ff51907b4cb6323be943a9ad Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:01:56 -0600 Subject: nfs: remove unused header Commit 19fcae3d4f2dd ("scsi: remove the SCSI OSD library") deleted the last file that included but left that file behind. It's unused, get rid of it now. 
Cc: Christoph Hellwig Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Signed-off-by: Jonathan Corbet Signed-off-by: Trond Myklebust --- include/linux/pnfs_osd_xdr.h | 317 ------------------------------------------- 1 file changed, 317 deletions(-) delete mode 100644 include/linux/pnfs_osd_xdr.h (limited to 'include/linux') diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h deleted file mode 100644 index 17d7d0d20eca..000000000000 --- a/include/linux/pnfs_osd_xdr.h +++ /dev/null @@ -1,317 +0,0 @@ -/* - * pNFS-osd on-the-wire data structures - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef __PNFS_OSD_XDR_H__ -#define __PNFS_OSD_XDR_H__ - -#include - -/* - * draft-ietf-nfsv4-minorversion-22 - * draft-ietf-nfsv4-pnfs-obj-12 - */ - -/* Layout Structure */ - -enum pnfs_osd_raid_algorithm4 { - PNFS_OSD_RAID_0 = 1, - PNFS_OSD_RAID_4 = 2, - PNFS_OSD_RAID_5 = 3, - PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ -}; - -/* struct pnfs_osd_data_map4 { - * uint32_t odm_num_comps; - * length4 odm_stripe_unit; - * uint32_t odm_group_width; - * uint32_t odm_group_depth; - * uint32_t odm_mirror_cnt; - * pnfs_osd_raid_algorithm4 odm_raid_algorithm; - * }; - */ -struct pnfs_osd_data_map { - u32 odm_num_comps; - u64 odm_stripe_unit; - u32 odm_group_width; - u32 odm_group_depth; - u32 odm_mirror_cnt; - u32 odm_raid_algorithm; -}; - -/* struct pnfs_osd_objid4 { - * deviceid4 oid_device_id; - * uint64_t oid_partition_id; - * uint64_t oid_object_id; - * }; - */ -struct pnfs_osd_objid { - struct nfs4_deviceid oid_device_id; - u64 oid_partition_id; - u64 oid_object_id; -}; - -/* For printout. 
I use: - * kprint("dev(%llx:%llx)", _DEVID_LO(pointer), _DEVID_HI(pointer)); - * BE style - */ -#define _DEVID_LO(oid_device_id) \ - (unsigned long long)be64_to_cpup((__be64 *)(oid_device_id)->data) - -#define _DEVID_HI(oid_device_id) \ - (unsigned long long)be64_to_cpup(((__be64 *)(oid_device_id)->data) + 1) - -enum pnfs_osd_version { - PNFS_OSD_MISSING = 0, - PNFS_OSD_VERSION_1 = 1, - PNFS_OSD_VERSION_2 = 2 -}; - -struct pnfs_osd_opaque_cred { - u32 cred_len; - void *cred; -}; - -enum pnfs_osd_cap_key_sec { - PNFS_OSD_CAP_KEY_SEC_NONE = 0, - PNFS_OSD_CAP_KEY_SEC_SSV = 1, -}; - -/* struct pnfs_osd_object_cred4 { - * pnfs_osd_objid4 oc_object_id; - * pnfs_osd_version4 oc_osd_version; - * pnfs_osd_cap_key_sec4 oc_cap_key_sec; - * opaque oc_capability_key<>; - * opaque oc_capability<>; - * }; - */ -struct pnfs_osd_object_cred { - struct pnfs_osd_objid oc_object_id; - u32 oc_osd_version; - u32 oc_cap_key_sec; - struct pnfs_osd_opaque_cred oc_cap_key; - struct pnfs_osd_opaque_cred oc_cap; -}; - -/* struct pnfs_osd_layout4 { - * pnfs_osd_data_map4 olo_map; - * uint32_t olo_comps_index; - * pnfs_osd_object_cred4 olo_components<>; - * }; - */ -struct pnfs_osd_layout { - struct pnfs_osd_data_map olo_map; - u32 olo_comps_index; - u32 olo_num_comps; - struct pnfs_osd_object_cred *olo_comps; -}; - -/* Device Address */ -enum pnfs_osd_targetid_type { - OBJ_TARGET_ANON = 1, - OBJ_TARGET_SCSI_NAME = 2, - OBJ_TARGET_SCSI_DEVICE_ID = 3, -}; - -/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { - * case OBJ_TARGET_SCSI_NAME: - * string oti_scsi_name<>; - * - * case OBJ_TARGET_SCSI_DEVICE_ID: - * opaque oti_scsi_device_id<>; - * - * default: - * void; - * }; - * - * union pnfs_osd_targetaddr4 switch (bool ota_available) { - * case TRUE: - * netaddr4 ota_netaddr; - * case FALSE: - * void; - * }; - * - * struct pnfs_osd_deviceaddr4 { - * pnfs_osd_targetid4 oda_targetid; - * pnfs_osd_targetaddr4 oda_targetaddr; - * uint64_t oda_lun; - * opaque oda_systemid<>; - * pnfs_osd_object_cred4 oda_root_obj_cred; - * opaque oda_osdname<>; - * }; - */ -struct pnfs_osd_targetid { - u32 oti_type; - struct nfs4_string oti_scsi_device_id; -}; - -/* struct netaddr4 { - * // see struct rpcb in RFC1833 - * string r_netid<>; // network id - * string r_addr<>; // universal address - * }; - */ -struct pnfs_osd_net_addr { - struct nfs4_string r_netid; - struct nfs4_string r_addr; -}; - -struct pnfs_osd_targetaddr { - u32 ota_available; - struct pnfs_osd_net_addr ota_netaddr; -}; - -struct pnfs_osd_deviceaddr { - struct pnfs_osd_targetid oda_targetid; - struct pnfs_osd_targetaddr oda_targetaddr; - u8 oda_lun[8]; - struct nfs4_string oda_systemid; - struct pnfs_osd_object_cred oda_root_obj_cred; - struct nfs4_string oda_osdname; -}; - -/* LAYOUTCOMMIT: layoutupdate */ - -/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { - * case TRUE: - * int64_t dsu_delta; - * case FALSE: - * void; - * }; - * - * struct pnfs_osd_layoutupdate4 { - * pnfs_osd_deltaspaceused4 olu_delta_space_used; - * bool olu_ioerr_flag; - * }; - */ -struct pnfs_osd_layoutupdate { - u32 dsu_valid; - s64 dsu_delta; - u32 olu_ioerr_flag; -}; - -/* LAYOUTRETURN: I/O Rrror Report */ - -enum pnfs_osd_errno { - PNFS_OSD_ERR_EIO = 1, - PNFS_OSD_ERR_NOT_FOUND = 2, - PNFS_OSD_ERR_NO_SPACE = 3, - PNFS_OSD_ERR_BAD_CRED = 4, - PNFS_OSD_ERR_NO_ACCESS = 5, - PNFS_OSD_ERR_UNREACHABLE = 6, - PNFS_OSD_ERR_RESOURCE = 7 -}; - -/* struct pnfs_osd_ioerr4 { - * pnfs_osd_objid4 oer_component; - * length4 oer_comp_offset; - * length4 oer_comp_length; - * bool 
oer_iswrite; - * pnfs_osd_errno4 oer_errno; - * }; - */ -struct pnfs_osd_ioerr { - struct pnfs_osd_objid oer_component; - u64 oer_comp_offset; - u64 oer_comp_length; - u32 oer_iswrite; - u32 oer_errno; -}; - -/* OSD XDR Client API */ -/* Layout helpers */ -/* Layout decoding is done in two parts: - * 1. First Call pnfs_osd_xdr_decode_layout_map to read in only the header part - * of the layout. @iter members need not be initialized. - * Returned: - * @layout members are set. (@layout->olo_comps set to NULL). - * - * Zero on success, or negative error if passed xdr is broken. - * - * 2. 2nd Call pnfs_osd_xdr_decode_layout_comp() in a loop until it returns - * false, to decode the next component. - * Returned: - * true if there is more to decode or false if we are done or error. - * - * Example: - * struct pnfs_osd_xdr_decode_layout_iter iter; - * struct pnfs_osd_layout layout; - * struct pnfs_osd_object_cred comp; - * int status; - * - * status = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); - * if (unlikely(status)) - * goto err; - * while(pnfs_osd_xdr_decode_layout_comp(&comp, &iter, xdr, &status)) { - * // All of @comp strings point to inside the xdr_buffer - * // or scrach buffer. Copy them out to user memory eg. - * copy_single_comp(dest_comp++, &comp); - * } - * if (unlikely(status)) - * goto err; - */ - -struct pnfs_osd_xdr_decode_layout_iter { - unsigned total_comps; - unsigned decoded_comps; -}; - -extern int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr); - -extern bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, - int *err); - -/* Device Info helpers */ - -/* Note: All strings inside @deviceaddr point to space inside @p. - * @p should stay valid while @deviceaddr is in use. - */ -extern void pnfs_osd_xdr_decode_deviceaddr( - struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p); - -/* layoutupdate (layout_commit) xdr helpers */ -extern int -pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, - struct pnfs_osd_layoutupdate *lou); - -/* osd_ioerror encoding (layout_return) */ -extern __be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr); -extern void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr); - -#endif /* __PNFS_OSD_XDR_H__ */ -- cgit v1.2.3 From 92f62485b3715882cd397b0cbd80a96d179b86d6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 2 Nov 2021 21:31:22 +0200 Subject: net: dsa: felix: fix broken VLAN-tagged PTP under VLAN-aware bridge Normally it is expected that the dsa_device_ops :: rcv() method finishes parsing the DSA tag and consumes it, then never looks at it again. But commit c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") added support for RX timestamping in a very unconventional way. On this switch, a partial timestamp is available in the DSA header, but the driver got away with not parsing that timestamp right away, but instead delayed that parsing for a little longer: dsa_switch_rcv(): nskb = cpu_dp->rcv(skb, dev); <------------- not here -> ocelot_rcv() ... skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); ... 
if (dsa_skb_defer_rx_timestamp(p, skb)) <--- but here -> felix_rxtstamp() return 0; When in felix_rxtstamp(), this driver accounted for the fact that eth_type_trans() happened in the meantime, so it got a hold of the extraction header again by subtracting (ETH_HLEN + OCELOT_TAG_LEN) bytes from the current skb->data. This worked for quite some time but was quite fragile from the very beginning. Not to mention that having DSA tag parsing split in two different files, under different folders (net/dsa/tag_ocelot.c vs drivers/net/dsa/ocelot/felix.c) made it quite non-obvious for patches to come that they might break this. Finally, the blamed commit does the following: at the end of ocelot_rcv(), it checks whether the skb payload contains a VLAN header. If it does, and this port is under a VLAN-aware bridge, that VLAN ID might not be correct in the sense that the packet might have suffered VLAN rewriting due to TCAM rules (VCAP IS1). So we consume the VLAN ID from the skb payload using __skb_vlan_pop(), and take the classified VLAN ID from the DSA tag, and construct a hwaccel VLAN tag with the classified VLAN, and the skb payload is VLAN-untagged. The big problem is that __skb_vlan_pop() does: memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); __skb_pull(skb, VLAN_HLEN); aka it moves the Ethernet header 4 bytes to the right, and pulls 4 bytes from the skb headroom (effectively also moving skb->data, by definition). So for felix_rxtstamp()'s fragile logic, all bets are off now. Instead of having the "extraction" pointer point to the DSA header, it actually points to 4 bytes _inside_ the extraction header. Corollary, the last 4 bytes of the "extraction" header are in fact 4 stale bytes of the destination MAC address from the Ethernet header, from prior to the __skb_vlan_pop() movement. So of course, RX timestamps are completely bogus when the system is configured in this way. The fix is actually very simple: just don't structure the code like that. For better or worse, the DSA PTP timestamping API does not offer a straightforward way for drivers to present their RX timestamps, but other drivers (sja1105) have established a simple mechanism to carry their RX timestamp from dsa_device_ops :: rcv() all the way to dsa_switch_ops :: port_rxtstamp() and even later. That mechanism is to simply save the partial timestamp to the skb->cb, and complete it later. Question: why don't we simply populate the skb's struct skb_shared_hwtstamps from ocelot_rcv(), instead of bothering with this complication of propagating the timestamp to felix_rxtstamp()? Answer: dsa_switch_ops :: port_rxtstamp() answers the question whether PTP packets need sleepable context to retrieve the full RX timestamp. Currently felix_rxtstamp() answers "no, thanks" to that question, and calls ocelot_ptp_gettime64() from softirq atomic context. This is understandable, since Felix VSC9959 is a PCIe memory-mapped switch, so hardware access does not require sleeping. But the felix driver is preparing for the introduction of other switches where hardware access is over a slow bus like SPI or MDIO: https://lore.kernel.org/lkml/20210814025003.2449143-1-colin.foster@in-advantage.com/ So I would like to keep this code structure, so the rework needed when that driver will need PTP support will be minimal (answer "yes, I need deferred context for this skb's RX timestamp", then the partial timestamp will still be found in the skb->cb).
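To make the skb->cb handoff concrete, here is a minimal sketch of the two halves (simplified from the real code; error paths are omitted, "extraction" is assumed to point at the DSA tag, and ocelot_xfh_get_rew_val()/OCELOT_SKB_CB() are the driver's existing helpers):

	/* net/dsa/tag_ocelot.c: in the tagger's rcv(), while the
	 * extraction header is still valid, stash the 32 partial
	 * timestamp bits in the skb control block.
	 */
	u64 tstamp_lo;

	ocelot_xfh_get_rew_val(extraction, &tstamp_lo);
	OCELOT_SKB_CB(skb)->tstamp_lo = tstamp_lo;

	/* drivers/net/dsa/ocelot/felix.c: complete the partial timestamp
	 * against the PTP clock, long after the DSA tag itself is gone.
	 */
	static bool felix_rxtstamp(struct dsa_switch *ds, int port,
				   struct sk_buff *skb, unsigned int type)
	{
		u32 tstamp_lo = OCELOT_SKB_CB(skb)->tstamp_lo;
		struct skb_shared_hwtstamps *shhwtstamps;
		struct ocelot *ocelot = ds->priv;
		struct timespec64 ts;
		u64 tstamp, tstamp_hi;

		ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
		tstamp = ktime_set(ts.tv_sec, ts.tv_nsec);

		/* Reconstruct the high bits, accounting for a possible
		 * wraparound of the low 32 bits between RX and now.
		 */
		tstamp_hi = tstamp & ~0xffffffffull;
		if ((tstamp & 0xffffffff) < tstamp_lo)
			tstamp_hi -= BIT_ULL(32);

		shhwtstamps = skb_hwtstamps(skb);
		memset(shhwtstamps, 0, sizeof(*shhwtstamps));
		shhwtstamps->hwtstamp = tstamp_hi | tstamp_lo;
		return false;
	}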
Fixes: ea440cd2d9b2 ("net: dsa: tag_ocelot: use VLAN information from tagging header when available") Reported-by: Po Liu Cc: Yangbo Lu Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/ocelot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index d42010cf5468..7ee708ad7df2 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -12,6 +12,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; }; -- cgit v1.2.3 From 6429e46304ac7820eebbea2bf5d73b90c18e0e06 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 28 Oct 2021 10:47:21 +0100 Subject: libfs: Move shmem_exchange to simple_rename_exchange Move shmem_exchange and make it available to other callers. Suggested-by: Miklos Szeredi Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Acked-by: Miklos Szeredi Cc: Al Viro Cc: Christian Brauner Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20211028094724.59043-2-lmb@cloudflare.com --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f3cfca5edc9a..bc4e97b82ddd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3383,6 +3383,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); -- cgit v1.2.3 From b18c1ad685d9a6af5189d4cc037a653270a07b4c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 18 Oct 2021 15:17:25 +0300 Subject: i2c: Allow an ACPI driver to manage the device's power state during probe Enable drivers to tell ACPI that there's no need to power on a device for probe. Drivers should still perform this by themselves if there's a need to. In some cases powering on the device during probe is undesirable, and this change enables a driver to choose what fits best for it. Add a field called "flags" to struct i2c_driver for driver flags, and a flag I2C_DRV_ACPI_WAIVE_D0_PROBE to tell that a driver supports probe in ACPI D states other than 0. Signed-off-by: Sakari Ailus Reviewed-by: Tomasz Figa Acked-by: Wolfram Sang Signed-off-by: Rafael J.
Wysocki --- include/linux/i2c.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2ce3efbe9198..16119ac1aa97 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -11,6 +11,7 @@ #define _LINUX_I2C_H #include <linux/acpi.h> /* for acpi_handle */ +#include <linux/bits.h> #include <linux/mod_devicetable.h> #include <linux/device.h> /* for struct device */ #include <linux/sched.h> /* for completion */ @@ -222,6 +223,15 @@ enum i2c_alert_protocol { I2C_PROTOCOL_SMBUS_HOST_NOTIFY, }; +/** + * enum i2c_driver_flags - Flags for an I2C device driver + * + * @I2C_DRV_ACPI_WAIVE_D0_PROBE: Don't put the device in D0 state for probe + */ +enum i2c_driver_flags { + I2C_DRV_ACPI_WAIVE_D0_PROBE = BIT(0), +}; + /** * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) @@ -236,6 +246,7 @@ enum i2c_alert_protocol { * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) * @clients: List of detected clients we created (for i2c-core use only) + * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. @@ -294,6 +305,8 @@ struct i2c_driver { int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; struct list_head clients; + + u32 flags; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) @@ -1015,6 +1028,7 @@ u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info); struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); +bool i2c_acpi_waive_d0_probe(struct device *dev); #else static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) @@ -1038,6 +1052,10 @@ static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle ha { return NULL; } +static inline bool i2c_acpi_waive_d0_probe(struct device *dev) +{ + return false; +} #endif /* CONFIG_ACPI */ #endif /* _LINUX_I2C_H */ -- cgit v1.2.3 From b82a7df4a7f3841896aaec1ad81e654bc87b5989 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 18 Oct 2021 15:17:27 +0300 Subject: ACPI: Add a convenience function to tell a device is in D0 state Add a convenience function to tell whether a device is in D0 state, primarily for use in drivers' probe or remove functions on busses where the custom is to power on the device for the duration of both. Returns true on non-ACPI systems, matching the stub below. Suggested-by: Mika Westerberg Signed-off-by: Sakari Ailus Reviewed-by: Tomasz Figa Signed-off-by: Rafael J.
Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fbc2146050a4..df70fc27afd7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1016,6 +1016,7 @@ int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); bool acpi_storage_d3(struct device *dev); +bool acpi_dev_state_d0(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } @@ -1027,6 +1028,10 @@ static inline bool acpi_storage_d3(struct device *dev) { return false; } +static inline bool acpi_dev_state_d0(struct device *dev) +{ + return true; +} #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) -- cgit v1.2.3 From 5c4e0a21fae877a7ef89be6dcc6263ec672372b8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Nov 2021 07:24:20 -0700 Subject: string: uninline memcpy_and_pad When building m68k:allmodconfig, recent versions of gcc generate the following error if the length of UTS_RELEASE is less than 8 bytes. In function 'memcpy_and_pad', inlined from 'nvmet_execute_disc_identify' at drivers/nvme/target/discovery.c:268:2: arch/m68k/include/asm/string.h:72:25: error: '__builtin_memcpy' reading 8 bytes from a region of size 7 Discussions around the problem suggest that this only happens if an architecture does not provide strlen(), if -ffreestanding is provided as compiler option, and if CONFIG_FORTIFY_SOURCE=n. All of this is the case for m68k. The exact reasons are unknown, but seem to be related to the ability of the compiler to evaluate the return value of strlen() and the resulting execution flow in memcpy_and_pad(). It would be possible to work around the problem by using sizeof(UTS_RELEASE) instead of strlen(UTS_RELEASE), but that would only postpone the problem until the function is called in a similar way. Uninline memcpy_and_pad() instead to solve the problem for good. Suggested-by: Linus Torvalds Reviewed-by: Geert Uytterhoeven Acked-by: Andy Shevchenko Signed-off-by: Guenter Roeck Signed-off-by: Linus Torvalds --- include/linux/string.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 5a36608144a9..b6572aeca2f5 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -253,23 +253,8 @@ static inline const char *kbasename(const char *path) #include #endif -/** - * memcpy_and_pad - Copy one buffer to another with padding - * @dest: Where to copy to - * @dest_len: The destination buffer size - * @src: Where to copy from - * @count: The number of bytes to copy - * @pad: Character to use for padding if space is left in destination. - */ -static inline void memcpy_and_pad(void *dest, size_t dest_len, - const void *src, size_t count, int pad) -{ - if (dest_len > count) { - memcpy(dest, src, count); - memset(dest + count, pad, dest_len - count); - } else - memcpy(dest, src, dest_len); -} +void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, + int pad); /** * memset_after - Set a value after a struct member to the end of a struct -- cgit v1.2.3 From 00b06da29cf9dc633cdba87acd3f57f4df3fd5c7 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 29 Oct 2021 09:14:19 -0500 Subject: signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed As Andy pointed out that there are races between force_sig_info_to_task and sigaction[1] when force_sig_info_task. As Kees discovered[2] ptrace is also able to change these signals. In the case of seeccomp killing a process with a signal it is a security violation to allow the signal to be caught or manipulated. Solve this problem by introducing a new flag SA_IMMUTABLE that prevents sigaction and ptrace from modifying these forced signals. This flag is carefully made kernel internal so that no new ABI is introduced. Longer term I think this can be solved by guaranteeing short circuit delivery of signals in this case. Unfortunately reliable and guaranteed short circuit delivery of these signals is still a ways off from being implemented, tested, and merged. So I have implemented a much simpler alternative for now. [1] https://lkml.kernel.org/r/b5d52d25-7bde-4030-a7b1-7c6f8ab90660@www.fastmail.com [2] https://lkml.kernel.org/r/202110281136.5CE65399A7@keescook Cc: stable@vger.kernel.org Fixes: 307d522f5eb8 ("signal/seccomp: Refactor seccomp signal and coredump generation") Tested-by: Andrea Righi Tested-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- include/linux/signal_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index 34cb28b8f16c..a70b2bdbf4d9 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -70,6 +70,9 @@ struct ksignal { int sig; }; +/* Used to kill the race between sigaction and forced signals */ +#define SA_IMMUTABLE 0x00800000 + #ifndef __ARCH_UAPI_SA_FLAGS #ifdef SA_RESTORER #define __ARCH_UAPI_SA_FLAGS SA_RESTORER -- cgit v1.2.3 From 2116274af46b12df7cea1dc5698f3cf2f231f8a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Nov 2021 18:20:37 +0100 Subject: block: add a loff_t cast to bdev_nr_bytes Not really needed as both loff_t and sector_t are always 64-bits wide, but this documents the different types a bit better. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211104172037.531803-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 900325aa5d31..1dce769c4596 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -243,7 +243,7 @@ static inline sector_t bdev_nr_sectors(struct block_device *bdev) static inline loff_t bdev_nr_bytes(struct block_device *bdev) { - return bdev_nr_sectors(bdev) << SECTOR_SHIFT; + return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) -- cgit v1.2.3 From 0ab8d0f6ae3f1234e38eaedc3ff7c22bfdf7f78c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 17:38:34 +0100 Subject: irqdomain: Make of_phandle_args_to_fwspec() generally available of_phandle_args_to_fwspec() can be generally useful to code extracting a DT of_phandle and using an irq_fwspec to use the hierarchical irqdomain API. Make it visible to the rest of the kernel, including modules. 
Link: https://lore.kernel.org/r/20210929163847.2807812-2-maz@kernel.org Tested-by: Alyssa Rosenzweig Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- include/linux/irqdomain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 23e4ee523576..cfd442316f39 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -64,6 +64,10 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/* Conversion function from of_phandle_args fields to fwspec */ +void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec); + /* * Should several domains have the same device node, but serve * different purposes (for example one domain is for PCI/MSI, and the -- cgit v1.2.3 From 439b08c57c3fe1df85cfe9d00accdf9b62cb3275 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 3 Nov 2021 16:51:36 +0100 Subject: Revert "usb: core: hcd: Add support for deferring roothub registration" This reverts commit 58877b0824da15698bd85a0a9dbfa8c354e6ecb7. It has been reported to be causing problems in Arch and Fedora bug reports. Reported-by: Hans de Goede Link: https://bbs.archlinux.org/viewtopic.php?pid=2000956#p2000956 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019542 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2019576 Link: https://lore.kernel.org/r/42bcbea6-5eb8-16c7-336a-2cb72e71bc36@redhat.com Cc: Mathias Nyman Cc: Chris Chiu Cc: Alan Stern Cc: Kishon Vijay Abraham I Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 2c1fc9212cf2..548a028f2dab 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -124,7 +124,6 @@ struct usb_hcd { #define HCD_FLAG_RH_RUNNING 5 /* root hub is running? */ #define HCD_FLAG_DEAD 6 /* controller has died? */ #define HCD_FLAG_INTF_AUTHORIZED 7 /* authorize interfaces? */ -#define HCD_FLAG_DEFER_RH_REGISTER 8 /* Defer roothub registration */ /* The flags can be tested using these macros; they are likely to * be slightly faster than test_bit(). @@ -135,7 +134,6 @@ struct usb_hcd { #define HCD_WAKEUP_PENDING(hcd) ((hcd)->flags & (1U << HCD_FLAG_WAKEUP_PENDING)) #define HCD_RH_RUNNING(hcd) ((hcd)->flags & (1U << HCD_FLAG_RH_RUNNING)) #define HCD_DEAD(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEAD)) -#define HCD_DEFER_RH_REGISTER(hcd) ((hcd)->flags & (1U << HCD_FLAG_DEFER_RH_REGISTER)) /* * Specifies if interfaces are authorized by default -- cgit v1.2.3 From 27d9a4d69433af1827a764fe235866d5d5501fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 9 Sep 2021 11:48:48 +0200 Subject: pwm: Add might_sleep() annotations for !CONFIG_PWM API functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The normal implementations of these functions make use of mutexes. To make it obvious that these functions might sleep also add annotations to the dummy implementations in the !CONFIG_PWM case. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 725c9b784e60..515e33978e97 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -429,11 +429,13 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev, #else static inline struct pwm_device *pwm_request(int pwm_id, const char *label) { + might_sleep(); return ERR_PTR(-ENODEV); } static inline void pwm_free(struct pwm_device *pwm) { + might_sleep(); } static inline int pwm_apply_state(struct pwm_device *pwm, @@ -493,12 +495,14 @@ static inline struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, unsigned int index, const char *label) { + might_sleep(); return ERR_PTR(-ENODEV); } static inline struct pwm_device *pwm_get(struct device *dev, const char *consumer) { + might_sleep(); return ERR_PTR(-ENODEV); } @@ -506,16 +510,19 @@ static inline struct pwm_device *of_pwm_get(struct device *dev, struct device_node *np, const char *con_id) { + might_sleep(); return ERR_PTR(-ENODEV); } static inline void pwm_put(struct pwm_device *pwm) { + might_sleep(); } static inline struct pwm_device *devm_pwm_get(struct device *dev, const char *consumer) { + might_sleep(); return ERR_PTR(-ENODEV); } @@ -523,6 +530,7 @@ static inline struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np, const char *con_id) { + might_sleep(); return ERR_PTR(-ENODEV); } @@ -530,6 +538,7 @@ static inline struct pwm_device * devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id) { + might_sleep(); return ERR_PTR(-ENODEV); } #endif -- cgit v1.2.3 From 4ad91a227817ae48f931595d1101fc7100073ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 9 Sep 2021 11:48:49 +0200 Subject: pwm: Make it explicit that pwm_apply_state() might sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least some implementations sleep. So mark pwm_apply_state() with a might_sleep() to make callers aware. In the worst case this uncovers a valid atomic user, then we revert this patch and at least gained some more knowledge and then can work on a concept similar to gpio_get_value/gpio_get_value_cansleep. 
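For instance (made-up consumer code; "mypwm" is assumed to be a valid struct pwm_device pointer), a caller in atomic context such as an hrtimer handler would now be flagged when CONFIG_DEBUG_ATOMIC_SLEEP is enabled:

	static enum hrtimer_restart blink_cb(struct hrtimer *t)
	{
		struct pwm_state state;

		pwm_get_state(mypwm, &state);
		state.enabled = !state.enabled;
		pwm_apply_state(mypwm, &state); /* might_sleep() warns here */

		return HRTIMER_NORESTART;
	}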
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 515e33978e97..e6dac95e4960 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -441,6 +441,7 @@ static inline void pwm_free(struct pwm_device *pwm) static inline int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state) { + might_sleep(); return -ENOTSUPP; } @@ -452,6 +453,7 @@ static inline int pwm_adjust_config(struct pwm_device *pwm) static inline int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns) { + might_sleep(); return -EINVAL; } @@ -464,11 +466,13 @@ static inline int pwm_capture(struct pwm_device *pwm, static inline int pwm_enable(struct pwm_device *pwm) { + might_sleep(); return -EINVAL; } static inline void pwm_disable(struct pwm_device *pwm) { + might_sleep(); } static inline int pwm_set_chip_data(struct pwm_device *pwm, void *data) -- cgit v1.2.3 From c9a20383578abd8f7fb8ba88f4c6d25b47924c34 Mon Sep 17 00:00:00 2001 From: Carlos de Paula Date: Mon, 30 Aug 2021 16:53:45 -0300 Subject: mfd: da9063: Add support for latest EA silicon revision This update adds a new regmap to support the latest EA silicon, which will be selected based on the chip and variant information read from the device. Signed-off-by: Carlos de Paula Reviewed-by: Adam Thomson Signed-off-by: Lee Jones --- include/linux/mfd/da9063/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/da9063/core.h b/include/linux/mfd/da9063/core.h index fa7a43f02f27..8db52324f416 100644 --- a/include/linux/mfd/da9063/core.h +++ b/include/linux/mfd/da9063/core.h @@ -36,6 +36,7 @@ enum da9063_variant_codes { PMIC_DA9063_BB = 0x5, PMIC_DA9063_CA = 0x6, PMIC_DA9063_DA = 0x7, + PMIC_DA9063_EA = 0x8, }; /* Interrupts */ -- cgit v1.2.3 From ec14d90dee8ec6960324ae9f1116103efcde8a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 12 Oct 2021 17:39:35 +0200 Subject: mfd: tps65912: Make tps65912_device_exit() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up to now tps65912_device_exit() returns zero unconditionally. Make it return void instead, which makes it easier to see in the callers that there is no error to handle. Also the return value of i2c and spi remove callbacks is ignored anyway. Signed-off-by: Uwe Kleine-König Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211012153945.2651412-11-u.kleine-koenig@pengutronix.de --- include/linux/mfd/tps65912.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65912.h b/include/linux/mfd/tps65912.h index 7943e413deae..8a61386cb8c1 100644 --- a/include/linux/mfd/tps65912.h +++ b/include/linux/mfd/tps65912.h @@ -322,6 +322,6 @@ struct tps65912 { extern const struct regmap_config tps65912_regmap_config; int tps65912_device_init(struct tps65912 *tps); -int tps65912_device_exit(struct tps65912 *tps); +void tps65912_device_exit(struct tps65912 *tps); #endif /* __LINUX_MFD_TPS65912_H */ -- cgit v1.2.3 From 0cee0416563d7cac807c8f092941f3e37ede05db Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 19 Oct 2021 16:59:11 +0200 Subject: mfd: max77686: Correct tab-based alignment of register addresses Some lines have an extra tab; remove them for proper visual alignment, as in the rest of this file.
Signed-off-by: Luca Ceresoli Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211019145919.7327-2-luca@lucaceresoli.net --- include/linux/mfd/max77686-private.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77686-private.h b/include/linux/mfd/max77686-private.h index 833e578e051e..b1482b3cf353 100644 --- a/include/linux/mfd/max77686-private.h +++ b/include/linux/mfd/max77686-private.h @@ -133,35 +133,35 @@ enum max77686_pmic_reg { /* Reserved: 0x7A-0x7D */ MAX77686_REG_BBAT_CHG = 0x7E, - MAX77686_REG_32KHZ = 0x7F, + MAX77686_REG_32KHZ = 0x7F, MAX77686_REG_PMIC_END = 0x80, }; enum max77686_rtc_reg { - MAX77686_RTC_INT = 0x00, - MAX77686_RTC_INTM = 0x01, + MAX77686_RTC_INT = 0x00, + MAX77686_RTC_INTM = 0x01, MAX77686_RTC_CONTROLM = 0x02, MAX77686_RTC_CONTROL = 0x03, MAX77686_RTC_UPDATE0 = 0x04, /* Reserved: 0x5 */ MAX77686_WTSR_SMPL_CNTL = 0x06, - MAX77686_RTC_SEC = 0x07, - MAX77686_RTC_MIN = 0x08, - MAX77686_RTC_HOUR = 0x09, + MAX77686_RTC_SEC = 0x07, + MAX77686_RTC_MIN = 0x08, + MAX77686_RTC_HOUR = 0x09, MAX77686_RTC_WEEKDAY = 0x0A, - MAX77686_RTC_MONTH = 0x0B, - MAX77686_RTC_YEAR = 0x0C, - MAX77686_RTC_DATE = 0x0D, - MAX77686_ALARM1_SEC = 0x0E, - MAX77686_ALARM1_MIN = 0x0F, + MAX77686_RTC_MONTH = 0x0B, + MAX77686_RTC_YEAR = 0x0C, + MAX77686_RTC_DATE = 0x0D, + MAX77686_ALARM1_SEC = 0x0E, + MAX77686_ALARM1_MIN = 0x0F, MAX77686_ALARM1_HOUR = 0x10, MAX77686_ALARM1_WEEKDAY = 0x11, MAX77686_ALARM1_MONTH = 0x12, MAX77686_ALARM1_YEAR = 0x13, MAX77686_ALARM1_DATE = 0x14, - MAX77686_ALARM2_SEC = 0x15, - MAX77686_ALARM2_MIN = 0x16, + MAX77686_ALARM2_SEC = 0x15, + MAX77686_ALARM2_MIN = 0x16, MAX77686_ALARM2_HOUR = 0x17, MAX77686_ALARM2_WEEKDAY = 0x18, MAX77686_ALARM2_MONTH = 0x19, -- cgit v1.2.3 From b20cd02f7fef68ae395d9df0a9fb9edcf414b5a2 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Oct 2021 22:22:58 +0300 Subject: mfd: tps80031: Remove driver Driver was upstreamed in 2013 and never got a user, remove it. Signed-off-by: Dmitry Osipenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20211021192258.21968-4-digetx@gmail.com --- include/linux/mfd/tps80031.h | 637 ------------------------------------------- 1 file changed, 637 deletions(-) delete mode 100644 include/linux/mfd/tps80031.h (limited to 'include/linux') diff --git a/include/linux/mfd/tps80031.h b/include/linux/mfd/tps80031.h deleted file mode 100644 index 2c75c9c9318f..000000000000 --- a/include/linux/mfd/tps80031.h +++ /dev/null @@ -1,637 +0,0 @@ -/* - * tps80031.h -- TI TPS80031 and TI TPS80032 PMIC driver. - * - * Copyright (c) 2012, NVIDIA Corporation. - * - * Author: Laxman Dewangan - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed "as is" WITHOUT ANY WARRANTY of any kind, - * whether express or implied; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - * 02111-1307, USA - */ - -#ifndef __LINUX_MFD_TPS80031_H -#define __LINUX_MFD_TPS80031_H - -#include -#include - -/* Pull-ups/Pull-downs */ -#define TPS80031_CFG_INPUT_PUPD1 0xF0 -#define TPS80031_CFG_INPUT_PUPD2 0xF1 -#define TPS80031_CFG_INPUT_PUPD3 0xF2 -#define TPS80031_CFG_INPUT_PUPD4 0xF3 -#define TPS80031_CFG_LDO_PD1 0xF4 -#define TPS80031_CFG_LDO_PD2 0xF5 -#define TPS80031_CFG_SMPS_PD 0xF6 - -/* Real Time Clock */ -#define TPS80031_SECONDS_REG 0x00 -#define TPS80031_MINUTES_REG 0x01 -#define TPS80031_HOURS_REG 0x02 -#define TPS80031_DAYS_REG 0x03 -#define TPS80031_MONTHS_REG 0x04 -#define TPS80031_YEARS_REG 0x05 -#define TPS80031_WEEKS_REG 0x06 -#define TPS80031_ALARM_SECONDS_REG 0x08 -#define TPS80031_ALARM_MINUTES_REG 0x09 -#define TPS80031_ALARM_HOURS_REG 0x0A -#define TPS80031_ALARM_DAYS_REG 0x0B -#define TPS80031_ALARM_MONTHS_REG 0x0C -#define TPS80031_ALARM_YEARS_REG 0x0D -#define TPS80031_RTC_CTRL_REG 0x10 -#define TPS80031_RTC_STATUS_REG 0x11 -#define TPS80031_RTC_INTERRUPTS_REG 0x12 -#define TPS80031_RTC_COMP_LSB_REG 0x13 -#define TPS80031_RTC_COMP_MSB_REG 0x14 -#define TPS80031_RTC_RESET_STATUS_REG 0x16 - -/*PMC Master Module */ -#define TPS80031_PHOENIX_START_CONDITION 0x1F -#define TPS80031_PHOENIX_MSK_TRANSITION 0x20 -#define TPS80031_STS_HW_CONDITIONS 0x21 -#define TPS80031_PHOENIX_LAST_TURNOFF_STS 0x22 -#define TPS80031_VSYSMIN_LO_THRESHOLD 0x23 -#define TPS80031_VSYSMIN_HI_THRESHOLD 0x24 -#define TPS80031_PHOENIX_DEV_ON 0x25 -#define TPS80031_STS_PWR_GRP_STATE 0x27 -#define TPS80031_PH_CFG_VSYSLOW 0x28 -#define TPS80031_PH_STS_BOOT 0x29 -#define TPS80031_PHOENIX_SENS_TRANSITION 0x2A -#define TPS80031_PHOENIX_SEQ_CFG 0x2B -#define TPS80031_PRIMARY_WATCHDOG_CFG 0X2C -#define TPS80031_KEY_PRESS_DUR_CFG 0X2D -#define TPS80031_SMPS_LDO_SHORT_STS 0x2E - -/* PMC Slave Module - Broadcast */ -#define TPS80031_BROADCAST_ADDR_ALL 0x31 -#define TPS80031_BROADCAST_ADDR_REF 0x32 -#define TPS80031_BROADCAST_ADDR_PROV 0x33 -#define TPS80031_BROADCAST_ADDR_CLK_RST 0x34 - -/* PMC Slave Module SMPS Regulators */ -#define TPS80031_SMPS4_CFG_TRANS 0x41 -#define TPS80031_SMPS4_CFG_STATE 0x42 -#define TPS80031_SMPS4_CFG_VOLTAGE 0x44 -#define TPS80031_VIO_CFG_TRANS 0x47 -#define TPS80031_VIO_CFG_STATE 0x48 -#define TPS80031_VIO_CFG_FORCE 0x49 -#define TPS80031_VIO_CFG_VOLTAGE 0x4A -#define TPS80031_VIO_CFG_STEP 0x48 -#define TPS80031_SMPS1_CFG_TRANS 0x53 -#define TPS80031_SMPS1_CFG_STATE 0x54 -#define TPS80031_SMPS1_CFG_FORCE 0x55 -#define TPS80031_SMPS1_CFG_VOLTAGE 0x56 -#define TPS80031_SMPS1_CFG_STEP 0x57 -#define TPS80031_SMPS2_CFG_TRANS 0x59 -#define TPS80031_SMPS2_CFG_STATE 0x5A -#define TPS80031_SMPS2_CFG_FORCE 0x5B -#define TPS80031_SMPS2_CFG_VOLTAGE 0x5C -#define TPS80031_SMPS2_CFG_STEP 0x5D -#define TPS80031_SMPS3_CFG_TRANS 0x65 -#define TPS80031_SMPS3_CFG_STATE 0x66 -#define TPS80031_SMPS3_CFG_VOLTAGE 0x68 - -/* PMC Slave Module LDO Regulators */ -#define TPS80031_VANA_CFG_TRANS 0x81 -#define TPS80031_VANA_CFG_STATE 0x82 -#define TPS80031_VANA_CFG_VOLTAGE 0x83 -#define TPS80031_LDO2_CFG_TRANS 0x85 -#define TPS80031_LDO2_CFG_STATE 0x86 -#define TPS80031_LDO2_CFG_VOLTAGE 0x87 -#define TPS80031_LDO4_CFG_TRANS 0x89 -#define TPS80031_LDO4_CFG_STATE 0x8A -#define TPS80031_LDO4_CFG_VOLTAGE 0x8B -#define TPS80031_LDO3_CFG_TRANS 0x8D -#define TPS80031_LDO3_CFG_STATE 0x8E -#define 
TPS80031_LDO3_CFG_VOLTAGE 0x8F -#define TPS80031_LDO6_CFG_TRANS 0x91 -#define TPS80031_LDO6_CFG_STATE 0x92 -#define TPS80031_LDO6_CFG_VOLTAGE 0x93 -#define TPS80031_LDOLN_CFG_TRANS 0x95 -#define TPS80031_LDOLN_CFG_STATE 0x96 -#define TPS80031_LDOLN_CFG_VOLTAGE 0x97 -#define TPS80031_LDO5_CFG_TRANS 0x99 -#define TPS80031_LDO5_CFG_STATE 0x9A -#define TPS80031_LDO5_CFG_VOLTAGE 0x9B -#define TPS80031_LDO1_CFG_TRANS 0x9D -#define TPS80031_LDO1_CFG_STATE 0x9E -#define TPS80031_LDO1_CFG_VOLTAGE 0x9F -#define TPS80031_LDOUSB_CFG_TRANS 0xA1 -#define TPS80031_LDOUSB_CFG_STATE 0xA2 -#define TPS80031_LDOUSB_CFG_VOLTAGE 0xA3 -#define TPS80031_LDO7_CFG_TRANS 0xA5 -#define TPS80031_LDO7_CFG_STATE 0xA6 -#define TPS80031_LDO7_CFG_VOLTAGE 0xA7 - -/* PMC Slave Module External Control */ -#define TPS80031_REGEN1_CFG_TRANS 0xAE -#define TPS80031_REGEN1_CFG_STATE 0xAF -#define TPS80031_REGEN2_CFG_TRANS 0xB1 -#define TPS80031_REGEN2_CFG_STATE 0xB2 -#define TPS80031_SYSEN_CFG_TRANS 0xB4 -#define TPS80031_SYSEN_CFG_STATE 0xB5 - -/* PMC Slave Module Internal Control */ -#define TPS80031_NRESPWRON_CFG_TRANS 0xB7 -#define TPS80031_NRESPWRON_CFG_STATE 0xB8 -#define TPS80031_CLK32KAO_CFG_TRANS 0xBA -#define TPS80031_CLK32KAO_CFG_STATE 0xBB -#define TPS80031_CLK32KG_CFG_TRANS 0xBD -#define TPS80031_CLK32KG_CFG_STATE 0xBE -#define TPS80031_CLK32KAUDIO_CFG_TRANS 0xC0 -#define TPS80031_CLK32KAUDIO_CFG_STATE 0xC1 -#define TPS80031_VRTC_CFG_TRANS 0xC3 -#define TPS80031_VRTC_CFG_STATE 0xC4 -#define TPS80031_BIAS_CFG_TRANS 0xC6 -#define TPS80031_BIAS_CFG_STATE 0xC7 -#define TPS80031_VSYSMIN_HI_CFG_TRANS 0xC9 -#define TPS80031_VSYSMIN_HI_CFG_STATE 0xCA -#define TPS80031_RC6MHZ_CFG_TRANS 0xCC -#define TPS80031_RC6MHZ_CFG_STATE 0xCD -#define TPS80031_TMP_CFG_TRANS 0xCF -#define TPS80031_TMP_CFG_STATE 0xD0 - -/* PMC Slave Module resources assignment */ -#define TPS80031_PREQ1_RES_ASS_A 0xD7 -#define TPS80031_PREQ1_RES_ASS_B 0xD8 -#define TPS80031_PREQ1_RES_ASS_C 0xD9 -#define TPS80031_PREQ2_RES_ASS_A 0xDA -#define TPS80031_PREQ2_RES_ASS_B 0xDB -#define TPS80031_PREQ2_RES_ASS_C 0xDC -#define TPS80031_PREQ3_RES_ASS_A 0xDD -#define TPS80031_PREQ3_RES_ASS_B 0xDE -#define TPS80031_PREQ3_RES_ASS_C 0xDF - -/* PMC Slave Module Miscellaneous */ -#define TPS80031_SMPS_OFFSET 0xE0 -#define TPS80031_SMPS_MULT 0xE3 -#define TPS80031_MISC1 0xE4 -#define TPS80031_MISC2 0xE5 -#define TPS80031_BBSPOR_CFG 0xE6 -#define TPS80031_TMP_CFG 0xE7 - -/* Battery Charging Controller and Indicator LED */ -#define TPS80031_CONTROLLER_CTRL2 0xDA -#define TPS80031_CONTROLLER_VSEL_COMP 0xDB -#define TPS80031_CHARGERUSB_VSYSREG 0xDC -#define TPS80031_CHARGERUSB_VICHRG_PC 0xDD -#define TPS80031_LINEAR_CHRG_STS 0xDE -#define TPS80031_CONTROLLER_INT_MASK 0xE0 -#define TPS80031_CONTROLLER_CTRL1 0xE1 -#define TPS80031_CONTROLLER_WDG 0xE2 -#define TPS80031_CONTROLLER_STAT1 0xE3 -#define TPS80031_CHARGERUSB_INT_STATUS 0xE4 -#define TPS80031_CHARGERUSB_INT_MASK 0xE5 -#define TPS80031_CHARGERUSB_STATUS_INT1 0xE6 -#define TPS80031_CHARGERUSB_STATUS_INT2 0xE7 -#define TPS80031_CHARGERUSB_CTRL1 0xE8 -#define TPS80031_CHARGERUSB_CTRL2 0xE9 -#define TPS80031_CHARGERUSB_CTRL3 0xEA -#define TPS80031_CHARGERUSB_STAT1 0xEB -#define TPS80031_CHARGERUSB_VOREG 0xEC -#define TPS80031_CHARGERUSB_VICHRG 0xED -#define TPS80031_CHARGERUSB_CINLIMIT 0xEE -#define TPS80031_CHARGERUSB_CTRLLIMIT1 0xEF -#define TPS80031_CHARGERUSB_CTRLLIMIT2 0xF0 -#define TPS80031_LED_PWM_CTRL1 0xF4 -#define TPS80031_LED_PWM_CTRL2 0xF5 - -/* USB On-The-Go */ -#define TPS80031_BACKUP_REG 0xFA -#define 
TPS80031_USB_VENDOR_ID_LSB 0x00 -#define TPS80031_USB_VENDOR_ID_MSB 0x01 -#define TPS80031_USB_PRODUCT_ID_LSB 0x02 -#define TPS80031_USB_PRODUCT_ID_MSB 0x03 -#define TPS80031_USB_VBUS_CTRL_SET 0x04 -#define TPS80031_USB_VBUS_CTRL_CLR 0x05 -#define TPS80031_USB_ID_CTRL_SET 0x06 -#define TPS80031_USB_ID_CTRL_CLR 0x07 -#define TPS80031_USB_VBUS_INT_SRC 0x08 -#define TPS80031_USB_VBUS_INT_LATCH_SET 0x09 -#define TPS80031_USB_VBUS_INT_LATCH_CLR 0x0A -#define TPS80031_USB_VBUS_INT_EN_LO_SET 0x0B -#define TPS80031_USB_VBUS_INT_EN_LO_CLR 0x0C -#define TPS80031_USB_VBUS_INT_EN_HI_SET 0x0D -#define TPS80031_USB_VBUS_INT_EN_HI_CLR 0x0E -#define TPS80031_USB_ID_INT_SRC 0x0F -#define TPS80031_USB_ID_INT_LATCH_SET 0x10 -#define TPS80031_USB_ID_INT_LATCH_CLR 0x11 -#define TPS80031_USB_ID_INT_EN_LO_SET 0x12 -#define TPS80031_USB_ID_INT_EN_LO_CLR 0x13 -#define TPS80031_USB_ID_INT_EN_HI_SET 0x14 -#define TPS80031_USB_ID_INT_EN_HI_CLR 0x15 -#define TPS80031_USB_OTG_ADP_CTRL 0x16 -#define TPS80031_USB_OTG_ADP_HIGH 0x17 -#define TPS80031_USB_OTG_ADP_LOW 0x18 -#define TPS80031_USB_OTG_ADP_RISE 0x19 -#define TPS80031_USB_OTG_REVISION 0x1A - -/* Gas Gauge */ -#define TPS80031_FG_REG_00 0xC0 -#define TPS80031_FG_REG_01 0xC1 -#define TPS80031_FG_REG_02 0xC2 -#define TPS80031_FG_REG_03 0xC3 -#define TPS80031_FG_REG_04 0xC4 -#define TPS80031_FG_REG_05 0xC5 -#define TPS80031_FG_REG_06 0xC6 -#define TPS80031_FG_REG_07 0xC7 -#define TPS80031_FG_REG_08 0xC8 -#define TPS80031_FG_REG_09 0xC9 -#define TPS80031_FG_REG_10 0xCA -#define TPS80031_FG_REG_11 0xCB - -/* General Purpose ADC */ -#define TPS80031_GPADC_CTRL 0x2E -#define TPS80031_GPADC_CTRL2 0x2F -#define TPS80031_RTSELECT_LSB 0x32 -#define TPS80031_RTSELECT_ISB 0x33 -#define TPS80031_RTSELECT_MSB 0x34 -#define TPS80031_GPSELECT_ISB 0x35 -#define TPS80031_CTRL_P1 0x36 -#define TPS80031_RTCH0_LSB 0x37 -#define TPS80031_RTCH0_MSB 0x38 -#define TPS80031_RTCH1_LSB 0x39 -#define TPS80031_RTCH1_MSB 0x3A -#define TPS80031_GPCH0_LSB 0x3B -#define TPS80031_GPCH0_MSB 0x3C - -/* SIM, MMC and Battery Detection */ -#define TPS80031_SIMDEBOUNCING 0xEB -#define TPS80031_SIMCTRL 0xEC -#define TPS80031_MMCDEBOUNCING 0xED -#define TPS80031_MMCCTRL 0xEE -#define TPS80031_BATDEBOUNCING 0xEF - -/* Vibrator Driver and PWMs */ -#define TPS80031_VIBCTRL 0x9B -#define TPS80031_VIBMODE 0x9C -#define TPS80031_PWM1ON 0xBA -#define TPS80031_PWM1OFF 0xBB -#define TPS80031_PWM2ON 0xBD -#define TPS80031_PWM2OFF 0xBE - -/* Control Interface */ -#define TPS80031_INT_STS_A 0xD0 -#define TPS80031_INT_STS_B 0xD1 -#define TPS80031_INT_STS_C 0xD2 -#define TPS80031_INT_MSK_LINE_A 0xD3 -#define TPS80031_INT_MSK_LINE_B 0xD4 -#define TPS80031_INT_MSK_LINE_C 0xD5 -#define TPS80031_INT_MSK_STS_A 0xD6 -#define TPS80031_INT_MSK_STS_B 0xD7 -#define TPS80031_INT_MSK_STS_C 0xD8 -#define TPS80031_TOGGLE1 0x90 -#define TPS80031_TOGGLE2 0x91 -#define TPS80031_TOGGLE3 0x92 -#define TPS80031_PWDNSTATUS1 0x93 -#define TPS80031_PWDNSTATUS2 0x94 -#define TPS80031_VALIDITY0 0x17 -#define TPS80031_VALIDITY1 0x18 -#define TPS80031_VALIDITY2 0x19 -#define TPS80031_VALIDITY3 0x1A -#define TPS80031_VALIDITY4 0x1B -#define TPS80031_VALIDITY5 0x1C -#define TPS80031_VALIDITY6 0x1D -#define TPS80031_VALIDITY7 0x1E - -/* Version number related register */ -#define TPS80031_JTAGVERNUM 0x87 -#define TPS80031_EPROM_REV 0xDF - -/* GPADC Trimming Bits. 
*/ -#define TPS80031_GPADC_TRIM0 0xCC -#define TPS80031_GPADC_TRIM1 0xCD -#define TPS80031_GPADC_TRIM2 0xCE -#define TPS80031_GPADC_TRIM3 0xCF -#define TPS80031_GPADC_TRIM4 0xD0 -#define TPS80031_GPADC_TRIM5 0xD1 -#define TPS80031_GPADC_TRIM6 0xD2 -#define TPS80031_GPADC_TRIM7 0xD3 -#define TPS80031_GPADC_TRIM8 0xD4 -#define TPS80031_GPADC_TRIM9 0xD5 -#define TPS80031_GPADC_TRIM10 0xD6 -#define TPS80031_GPADC_TRIM11 0xD7 -#define TPS80031_GPADC_TRIM12 0xD8 -#define TPS80031_GPADC_TRIM13 0xD9 -#define TPS80031_GPADC_TRIM14 0xDA -#define TPS80031_GPADC_TRIM15 0xDB -#define TPS80031_GPADC_TRIM16 0xDC -#define TPS80031_GPADC_TRIM17 0xDD -#define TPS80031_GPADC_TRIM18 0xDE - -/* TPS80031_CONTROLLER_STAT1 bit fields */ -#define TPS80031_CONTROLLER_STAT1_BAT_TEMP 0 -#define TPS80031_CONTROLLER_STAT1_BAT_REMOVED 1 -#define TPS80031_CONTROLLER_STAT1_VBUS_DET 2 -#define TPS80031_CONTROLLER_STAT1_VAC_DET 3 -#define TPS80031_CONTROLLER_STAT1_FAULT_WDG 4 -#define TPS80031_CONTROLLER_STAT1_LINCH_GATED 6 -/* TPS80031_CONTROLLER_INT_MASK bit filed */ -#define TPS80031_CONTROLLER_INT_MASK_MVAC_DET 0 -#define TPS80031_CONTROLLER_INT_MASK_MVBUS_DET 1 -#define TPS80031_CONTROLLER_INT_MASK_MBAT_TEMP 2 -#define TPS80031_CONTROLLER_INT_MASK_MFAULT_WDG 3 -#define TPS80031_CONTROLLER_INT_MASK_MBAT_REMOVED 4 -#define TPS80031_CONTROLLER_INT_MASK_MLINCH_GATED 5 - -#define TPS80031_CHARGE_CONTROL_SUB_INT_MASK 0x3F - -/* TPS80031_PHOENIX_DEV_ON bit field */ -#define TPS80031_DEVOFF 0x1 - -#define TPS80031_EXT_CONTROL_CFG_TRANS 0 -#define TPS80031_EXT_CONTROL_CFG_STATE 1 - -/* State register field */ -#define TPS80031_STATE_OFF 0x00 -#define TPS80031_STATE_ON 0x01 -#define TPS80031_STATE_MASK 0x03 - -/* Trans register field */ -#define TPS80031_TRANS_ACTIVE_OFF 0x00 -#define TPS80031_TRANS_ACTIVE_ON 0x01 -#define TPS80031_TRANS_ACTIVE_MASK 0x03 -#define TPS80031_TRANS_SLEEP_OFF 0x00 -#define TPS80031_TRANS_SLEEP_ON 0x04 -#define TPS80031_TRANS_SLEEP_MASK 0x0C -#define TPS80031_TRANS_OFF_OFF 0x00 -#define TPS80031_TRANS_OFF_ACTIVE 0x10 -#define TPS80031_TRANS_OFF_MASK 0x30 - -#define TPS80031_EXT_PWR_REQ (TPS80031_PWR_REQ_INPUT_PREQ1 | \ - TPS80031_PWR_REQ_INPUT_PREQ2 | \ - TPS80031_PWR_REQ_INPUT_PREQ3) - -/* TPS80031_BBSPOR_CFG bit field */ -#define TPS80031_BBSPOR_CHG_EN 0x8 -#define TPS80031_MAX_REGISTER 0xFF - -struct i2c_client; - -/* Supported chips */ -enum chips { - TPS80031 = 0x00000001, - TPS80032 = 0x00000002, -}; - -enum { - TPS80031_INT_PWRON, - TPS80031_INT_RPWRON, - TPS80031_INT_SYS_VLOW, - TPS80031_INT_RTC_ALARM, - TPS80031_INT_RTC_PERIOD, - TPS80031_INT_HOT_DIE, - TPS80031_INT_VXX_SHORT, - TPS80031_INT_SPDURATION, - TPS80031_INT_WATCHDOG, - TPS80031_INT_BAT, - TPS80031_INT_SIM, - TPS80031_INT_MMC, - TPS80031_INT_RES, - TPS80031_INT_GPADC_RT, - TPS80031_INT_GPADC_SW2_EOC, - TPS80031_INT_CC_AUTOCAL, - TPS80031_INT_ID_WKUP, - TPS80031_INT_VBUSS_WKUP, - TPS80031_INT_ID, - TPS80031_INT_VBUS, - TPS80031_INT_CHRG_CTRL, - TPS80031_INT_EXT_CHRG, - TPS80031_INT_INT_CHRG, - TPS80031_INT_RES2, - TPS80031_INT_BAT_TEMP_OVRANGE, - TPS80031_INT_BAT_REMOVED, - TPS80031_INT_VBUS_DET, - TPS80031_INT_VAC_DET, - TPS80031_INT_FAULT_WDG, - TPS80031_INT_LINCH_GATED, - - /* Last interrupt id to get the end number */ - TPS80031_INT_NR, -}; - -/* TPS80031 Slave IDs */ -#define TPS80031_NUM_SLAVES 4 -#define TPS80031_SLAVE_ID0 0 -#define TPS80031_SLAVE_ID1 1 -#define TPS80031_SLAVE_ID2 2 -#define TPS80031_SLAVE_ID3 3 - -/* TPS80031 I2C addresses */ -#define TPS80031_I2C_ID0_ADDR 0x12 -#define TPS80031_I2C_ID1_ADDR 0x48 -#define 
TPS80031_I2C_ID2_ADDR 0x49 -#define TPS80031_I2C_ID3_ADDR 0x4A - -enum { - TPS80031_REGULATOR_VIO, - TPS80031_REGULATOR_SMPS1, - TPS80031_REGULATOR_SMPS2, - TPS80031_REGULATOR_SMPS3, - TPS80031_REGULATOR_SMPS4, - TPS80031_REGULATOR_VANA, - TPS80031_REGULATOR_LDO1, - TPS80031_REGULATOR_LDO2, - TPS80031_REGULATOR_LDO3, - TPS80031_REGULATOR_LDO4, - TPS80031_REGULATOR_LDO5, - TPS80031_REGULATOR_LDO6, - TPS80031_REGULATOR_LDO7, - TPS80031_REGULATOR_LDOLN, - TPS80031_REGULATOR_LDOUSB, - TPS80031_REGULATOR_VBUS, - TPS80031_REGULATOR_REGEN1, - TPS80031_REGULATOR_REGEN2, - TPS80031_REGULATOR_SYSEN, - TPS80031_REGULATOR_MAX, -}; - -/* Different configurations for the rails */ -enum { - /* USBLDO input selection */ - TPS80031_USBLDO_INPUT_VSYS = 0x00000001, - TPS80031_USBLDO_INPUT_PMID = 0x00000002, - - /* LDO3 output mode */ - TPS80031_LDO3_OUTPUT_VIB = 0x00000004, - - /* VBUS configuration */ - TPS80031_VBUS_DISCHRG_EN_PDN = 0x00000004, - TPS80031_VBUS_SW_ONLY = 0x00000008, - TPS80031_VBUS_SW_N_ID = 0x00000010, -}; - -/* External controls requests */ -enum tps80031_ext_control { - TPS80031_PWR_REQ_INPUT_NONE = 0x00000000, - TPS80031_PWR_REQ_INPUT_PREQ1 = 0x00000001, - TPS80031_PWR_REQ_INPUT_PREQ2 = 0x00000002, - TPS80031_PWR_REQ_INPUT_PREQ3 = 0x00000004, - TPS80031_PWR_OFF_ON_SLEEP = 0x00000008, - TPS80031_PWR_ON_ON_SLEEP = 0x00000010, -}; - -enum tps80031_pupd_pins { - TPS80031_PREQ1 = 0, - TPS80031_PREQ2A, - TPS80031_PREQ2B, - TPS80031_PREQ2C, - TPS80031_PREQ3, - TPS80031_NRES_WARM, - TPS80031_PWM_FORCE, - TPS80031_CHRG_EXT_CHRG_STATZ, - TPS80031_SIM, - TPS80031_MMC, - TPS80031_GPADC_START, - TPS80031_DVSI2C_SCL, - TPS80031_DVSI2C_SDA, - TPS80031_CTLI2C_SCL, - TPS80031_CTLI2C_SDA, -}; - -enum tps80031_pupd_settings { - TPS80031_PUPD_NORMAL, - TPS80031_PUPD_PULLDOWN, - TPS80031_PUPD_PULLUP, -}; - -struct tps80031 { - struct device *dev; - unsigned long chip_info; - int es_version; - struct i2c_client *clients[TPS80031_NUM_SLAVES]; - struct regmap *regmap[TPS80031_NUM_SLAVES]; - struct regmap_irq_chip_data *irq_data; -}; - -struct tps80031_pupd_init_data { - int input_pin; - int setting; -}; - -/* - * struct tps80031_regulator_platform_data - tps80031 regulator platform data. - * - * @reg_init_data: The regulator init data. - * @ext_ctrl_flag: External control flag for sleep/power request control. - * @config_flags: Configuration flag to configure the rails. - * It should be ORed of config enums. 
- */ - -struct tps80031_regulator_platform_data { - struct regulator_init_data *reg_init_data; - unsigned int ext_ctrl_flag; - unsigned int config_flags; -}; - -struct tps80031_platform_data { - int irq_base; - bool use_power_off; - struct tps80031_pupd_init_data *pupd_init_data; - int pupd_init_data_size; - struct tps80031_regulator_platform_data - *regulator_pdata[TPS80031_REGULATOR_MAX]; -}; - -static inline int tps80031_write(struct device *dev, int sid, - int reg, uint8_t val) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_write(tps80031->regmap[sid], reg, val); -} - -static inline int tps80031_writes(struct device *dev, int sid, int reg, - int len, uint8_t *val) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_bulk_write(tps80031->regmap[sid], reg, val, len); -} - -static inline int tps80031_read(struct device *dev, int sid, - int reg, uint8_t *val) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - unsigned int ival; - int ret; - - ret = regmap_read(tps80031->regmap[sid], reg, &ival); - if (ret < 0) { - dev_err(dev, "failed reading from reg 0x%02x\n", reg); - return ret; - } - - *val = ival; - return ret; -} - -static inline int tps80031_reads(struct device *dev, int sid, - int reg, int len, uint8_t *val) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_bulk_read(tps80031->regmap[sid], reg, val, len); -} - -static inline int tps80031_set_bits(struct device *dev, int sid, - int reg, uint8_t bit_mask) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_update_bits(tps80031->regmap[sid], reg, - bit_mask, bit_mask); -} - -static inline int tps80031_clr_bits(struct device *dev, int sid, - int reg, uint8_t bit_mask) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_update_bits(tps80031->regmap[sid], reg, bit_mask, 0); -} - -static inline int tps80031_update(struct device *dev, int sid, - int reg, uint8_t val, uint8_t mask) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_update_bits(tps80031->regmap[sid], reg, mask, val); -} - -static inline unsigned long tps80031_get_chip_info(struct device *dev) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return tps80031->chip_info; -} - -static inline int tps80031_get_pmu_version(struct device *dev) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return tps80031->es_version; -} - -static inline int tps80031_irq_get_virq(struct device *dev, int irq) -{ - struct tps80031 *tps80031 = dev_get_drvdata(dev); - - return regmap_irq_get_virq(tps80031->irq_data, irq); -} - -extern int tps80031_ext_power_req_config(struct device *dev, - unsigned long ext_ctrl_flag, int preq_bit, - int state_reg_add, int trans_reg_add); -#endif /*__LINUX_MFD_TPS80031_H */ -- cgit v1.2.3 From d755ad8dc752d44545613ea04d660aed674e540d Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:00 -0400 Subject: NFS: Create a new nfs_alloc_fattr_with_label() function For creating fattrs with the label field already allocated for us. I also update nfs_free_fattr() to free the label in the end. 
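The shape of the new helper, roughly (a sketch, not the exact fs/nfs/inode.c code; nfs4_label_alloc() is the existing label allocator and returns NULL when labels are not in use):

	struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
	{
		struct nfs_fattr *fattr = nfs_alloc_fattr();

		if (!fattr)
			return NULL;

		fattr->label = nfs4_label_alloc(server, GFP_NOFS);
		if (IS_ERR(fattr->label)) {
			kfree(fattr);
			return NULL;
		}

		return fattr;
	}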
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 739ca1ef934f..88c3aed8ad39 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -426,9 +426,22 @@ extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); +extern struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server); + +static inline void nfs4_label_free(struct nfs4_label *label) +{ +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (label) { + kfree(label->label); + kfree(label); + } +#endif +} static inline void nfs_free_fattr(const struct nfs_fattr *fattr) { + if (fattr) + nfs4_label_free(fattr->label); kfree(fattr); } -- cgit v1.2.3 From b1db9a401d464d526d5941f0544e7c9ea37fa731 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:01 -0400 Subject: NFS: Remove the nfs4_label from the nfs_entry struct And instead allocate the fattr using nfs_alloc_fattr_with_label() Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e9698b6278a5..9960f6628066 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -753,7 +753,6 @@ struct nfs_entry { int eof; struct nfs_fh * fh; struct nfs_fattr * fattr; - struct nfs4_label *label; unsigned char d_type; struct nfs_server * server; }; -- cgit v1.2.3 From 68be1742c22983558f0f148a4467eb9127d56b86 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:02 -0400 Subject: NFS: Remove the nfs4_label from the nfs4_create_res struct Instead, use the label embedded in the attached fattr. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9960f6628066..5aba81b74c98 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1040,7 +1040,6 @@ struct nfs4_create_res { const struct nfs_server * server; struct nfs_fh * fh; struct nfs_fattr * fattr; - struct nfs4_label *label; struct nfs4_change_info dir_cinfo; }; -- cgit v1.2.3 From aa7ca3b2de190675543d84adaa1ff74e7867c76f Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:03 -0400 Subject: NFS: Remove the nfs4_label from the nfs4_link_res struct Again, use the fattr's label field instead. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5aba81b74c98..d55bf3fd5167 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1079,7 +1079,6 @@ struct nfs4_link_res { struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; - struct nfs4_label *label; struct nfs4_change_info cinfo; struct nfs_fattr * dir_attr; }; -- cgit v1.2.3 From 9558a007dbc383d48e7f5a123d0b5ff656c71068 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:04 -0400 Subject: NFS: Remove the label from the nfs4_lookup_res struct And use the fattr's label field instead. I also adjust function calls to remove labels along the way.
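E.g. (an assumed caller sketch, error handling trimmed) the label now simply rides along inside the fattr:

	fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir));
	if (fattr == NULL)
		return -ENOMEM;

	error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
	/* on success, the security label (if any) is found in fattr->label */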
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d55bf3fd5167..95219d5a8668 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1095,7 +1095,6 @@ struct nfs4_lookup_res { const struct nfs_server * server; struct nfs_fattr * fattr; struct nfs_fh * fh; - struct nfs4_label *label; }; struct nfs4_lookupp_arg { @@ -1740,8 +1739,7 @@ struct nfs_rpc_ops { int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); int (*lookup) (struct inode *, struct dentry *, - struct nfs_fh *, struct nfs_fattr *, - struct nfs4_label *); + struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *); int (*access) (struct inode *, struct nfs_access_entry *); -- cgit v1.2.3 From ba4bc8dc4d937df2b407393435a302550be0ad82 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:05 -0400 Subject: NFS: Remove the nfs4_label from the nfs4_lookupp_res struct Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 95219d5a8668..f0a685d9b8bd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1108,7 +1108,6 @@ struct nfs4_lookupp_res { const struct nfs_server *server; struct nfs_fattr *fattr; struct nfs_fh *fh; - struct nfs4_label *label; }; struct nfs4_lookup_root_arg { @@ -1741,7 +1740,7 @@ struct nfs_rpc_ops { int (*lookup) (struct inode *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, - struct nfs_fattr *, struct nfs4_label *); + struct nfs_fattr *); int (*access) (struct inode *, struct nfs_access_entry *); int (*readlink)(struct inode *, struct page *, unsigned int, unsigned int); -- cgit v1.2.3 From 76baa2b29c7161bc65a3051d311297b7d7fc827a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:06 -0400 Subject: NFS: Remove the f_label from the nfs4_opendata and nfs_openres Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index f0a685d9b8bd..cb28e01ea41e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -488,7 +488,6 @@ struct nfs_openres { struct nfs4_change_info cinfo; __u32 rflags; struct nfs_fattr * f_attr; - struct nfs4_label *f_label; struct nfs_seqid * seqid; const struct nfs_server *server; fmode_t delegation_type; -- cgit v1.2.3 From 2ef61e0eaa333e4e9c348c41a4b7abfb34b8736d Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:07 -0400 Subject: NFS: Remove the nfs4_label from the nfs4_getattr_res Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cb28e01ea41e..817f1bf5f187 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1063,7 +1063,6 @@ struct nfs4_getattr_res { struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; - struct nfs4_label *label; }; struct nfs4_link_arg { @@ 
-1732,8 +1731,7 @@ struct nfs_rpc_ops { int (*submount) (struct fs_context *, struct nfs_server *); int (*try_get_tree) (struct fs_context *); int (*getattr) (struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *, struct nfs4_label *, - struct inode *); + struct nfs_fattr *, struct inode *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); int (*lookup) (struct inode *, struct dentry *, -- cgit v1.2.3 From 1b00ad657997c8984a9e627a3bd37ea14f20beb2 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:08 -0400 Subject: NFS: Remove the nfs4_label from the nfs_setattrres Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 817f1bf5f187..967a0098f0a9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -832,7 +832,6 @@ struct nfs_getaclres { struct nfs_setattrres { struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; - struct nfs4_label *label; const struct nfs_server * server; }; -- cgit v1.2.3 From d91bfc46426d3d772fc0d9d165e3435fd0f0a79e Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:09 -0400 Subject: NFS: Remove the nfs4_label argument from nfs_instantiate() Pull the label from the fattr instead. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 88c3aed8ad39..a8a9b71aeea6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -532,7 +532,7 @@ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr, struct nfs4_label *label); + struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, -- cgit v1.2.3 From cc6f32989c3202349b90edde0c4702b098410fe8 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:10 -0400 Subject: NFS: Remove the nfs4_label argument from nfs_add_or_obtain() Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a8a9b71aeea6..6eda001b306b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -529,8 +529,7 @@ extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf); extern void nfs_clear_verifier_delegated(struct inode *inode); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, - struct nfs_fh *fh, struct nfs_fattr *fattr, - struct nfs4_label *label); + struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); -- cgit v1.2.3 From cf7ab00aabbf9c8f1ec72edff15849ddc23aa6a7 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:11 -0400 Subject: NFS: Remove the nfs4_label 
argument from nfs_fhget() Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 6eda001b306b..c36c6a559fc9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -388,7 +388,7 @@ extern void nfs_zap_caches(struct inode *); extern void nfs_set_inode_stale(struct inode *inode); extern void nfs_invalidate_atime(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, - struct nfs_fattr *, struct nfs4_label *); + struct nfs_fattr *); struct inode *nfs_ilookup(struct super_block *sb, struct nfs_fattr *, struct nfs_fh *); extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); -- cgit v1.2.3 From dd225cb3b02b827271a2284f89102fc81efcbf6f Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 22 Oct 2021 13:11:12 -0400 Subject: NFS: Remove the nfs4_label argument from nfs_setsecurity Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c36c6a559fc9..05f249f20f55 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -409,8 +409,7 @@ extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *map extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); -extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, - struct nfs4_label *label); +extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode); -- cgit v1.2.3 From 3990ed4c426652fcd469f8c9dc08156294b36c28 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 5 Nov 2021 18:40:14 -0700 Subject: bpf: Stop caching subprog index in the bpf_pseudo_func insn This patch fixes an out-of-bounds access issue when jit-ing the bpf_pseudo_func insn (i.e. ld_imm64 with src_reg == BPF_PSEUDO_FUNC). In jit_subprog(), it currently reuses the subprog index cached in insn[1].imm. This subprog index is an index into a few arrays related to subprogs. For example, in jit_subprog(), it is an index into the newly allocated 'struct bpf_prog **func' array. The subprog index was cached in insn[1].imm after add_subprog(). However, this could become outdated (and too big in this case) if some subprogs are completely removed during dead code elimination (in adjust_subprog_starts_after_remove). The cached index in insn[1].imm is not updated accordingly, causing an out-of-bounds access in the later jit_subprog(). Unlike the bpf_pseudo_'func' insn, the current bpf_pseudo_'call' insn handles DCE properly by calling find_subprog(insn->imm) to figure out the index instead of caching the subprog index. The existing bpf_adj_branches() will adjust the insn->imm whenever an insn is added or removed.
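For reference, a sketch of how such an insn is identified; this mirrors the bpf_pseudo_func() helper added in the diff further down (the helper in the diff is the authoritative version):

	/* The first half of a ld_imm64 whose src_reg marks it as a subprog
	 * reference rather than a plain 64-bit constant. */
	static bool is_pseudo_func(const struct bpf_insn *insn)
	{
		return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
		       insn->src_reg == BPF_PSEUDO_FUNC;
	}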
Instead of having two ways of handling the subprog index, this patch makes bpf_pseudo_func work more like bpf_pseudo_call. The first change is to stop caching the subprog index result in insn[1].imm after add_subprog(). The verification process will use find_subprog(insn->imm) to figure out the subprog index. The second change is in bpf_adj_branches(), having it adjust insn->imm for the bpf_pseudo_func insn as well whenever an insn is added or removed. The third change is in jit_subprog(). Like the bpf_pseudo_call handling, bpf_pseudo_func temporarily stores the find_subprog() result in insn->off. This is fine because the prog's insns have been finalized at this point. insn->off will be reset back to 0 later to avoid confusing the userspace prog dump tool. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211106014014.651018-1-kafai@fb.com --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be6dfd68df9..f715e8863f4d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -484,6 +484,12 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static inline bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); -- cgit v1.2.3 From 8587ca6f34152ea650bad4b2db68456601159024 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:35:07 -0700 Subject: mm: move kvmalloc-related functions to slab.h Not all files in the kernel should include mm.h. Migrating callers from kmalloc to kvmalloc is easier if the kvmalloc functions are in slab.h.
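As a usage sketch after the move (hypothetical caller; the function name and flags are illustrative, not from the patch), only slab.h is needed:

	#include <linux/slab.h>	/* kvmalloc() and friends now live here */

	static void *alloc_table(size_t nr, size_t elem_size)
	{
		/* falls back to vmalloc when a contiguous kmalloc fails */
		void *tbl = kvmalloc_array(nr, elem_size, GFP_KERNEL);

		/* may be NULL; the caller releases it with kvfree(tbl) */
		return tbl;
	}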
[akpm@linux-foundation.org: move the new kvrealloc() also] [akpm@linux-foundation.org: drivers/hwmon/occ/p9_sbe.c needs slab.h] Link: https://lkml.kernel.org/r/20210622215757.3525604-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 34 ---------------------------------- include/linux/slab.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a52aba448f..32b2ecc47408 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,40 +799,6 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} - -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); -extern void kvfree(const void *addr); -extern void kvfree_sensitive(const void *addr, size_t len); - static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; diff --git a/include/linux/slab.h b/include/linux/slab.h index 083f3ce550bc..c0d46b6fa12a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -732,6 +732,40 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } +extern void *kvmalloc_node(size_t size, gfp_t flags, int node); +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + +static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + + return kvmalloc(bytes, flags); +} + +static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +{ + return kvmalloc_array(n, size, flags | __GFP_ZERO); +} + +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, + gfp_t flags); +extern void kvfree(const void *addr); +extern void kvfree_sensitive(const void *addr, size_t len); + unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); -- cgit v1.2.3 From b47291ef02b0bee85ffb7efd6c336060ad1fe1a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 5 Nov 2021 13:35:17 -0700 Subject: mm, slub: change percpu partial accounting from objects to pages With CONFIG_SLUB_CPU_PARTIAL enabled, SLUB keeps a percpu list of partial slabs that can be promoted to cpu slab when 
the previous one is depleted, without accessing the shared partial list. A slab can be added to this list by 1) refill of an empty list from get_partial_node() - once we really have to access the shared partial list, we acquire multiple slabs to amortize the cost of locking, and 2) first free to a previously full slab - instead of putting the slab on a shared partial list, we can more cheaply freeze it and put it on the per-cpu list. To control how large a percpu partial list can grow for a kmem cache, set_cpu_partial() calculates a target number of free objects on each cpu's percpu partial list, and this can also be set by the sysfs file cpu_partial. However, the tracking of the actual number of objects is imprecise, in order to limit the overhead of cpu X freeing an object to a slab on the percpu partial list of cpu Y. Basically, the percpu partial slabs form a single linked list, and when we add a new slab to the list with current head "oldpage", we set in the struct page of the slab we're adding: page->pages = oldpage->pages + 1; // this is precise page->pobjects = oldpage->pobjects + (page->objects - page->inuse); page->next = oldpage; Thus the real number of free objects in the slab (objects - inuse) is only determined at the moment of adding the slab to the percpu partial list, and further freeing doesn't update the pobjects counter nor propagate it to the current list head. As Jann reports [1], this can easily lead to large inaccuracies, where the target number of objects (up to 30 by default) can translate to the same number of (empty) slab pages on the list. In case 2) above, we put a slab with 1 free object on the list, thus only increasing page->pobjects by 1, even if there are subsequent frees on the same slab. Jann has noticed this in practice and so did we [2] when investigating a significant increase of kmemcg usage after switching from SLAB to SLUB. While this is no longer a problem in kmemcg context thanks to the accounting rewrite in 5.9, the memory waste is still not ideal and it's questionable whether it makes sense to perform free-object-count-based control when object counts can so easily become inaccurate. So this patch converts the accounting to be based on the number of pages only (which is precise) and removes the page->pobjects field completely. This is also ultimately simpler. To retain the existing set_cpu_partial() heuristic, first calculate the target number of objects as previously, but then convert it to a target number of pages by assuming the pages will be half-filled on average. This assumption might obviously also be inaccurate in practice, but cannot degrade to the actual number of pages being equal to the target number of objects. We could also skip the intermediate step with the target number of objects and rewrite the heuristic in terms of pages. However, we still have the sysfs file cpu_partial, which uses a number of objects and could break existing users if it suddenly became a number of pages, so this patch doesn't do that. In practice, after this patch the heuristics limit the size of the percpu partial list to at most 2 pages. In case of a reported regression (which would mean some workload has benefited from the previous imprecise object-based counting), we can tune the heuristics to get a better compromise within the new scheme, while still avoiding unexpectedly long percpu partial lists.
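A rough sketch of the resulting scheme (simplified, and partly an assumption on my part; the authoritative logic is in mm/slub.c's put_cpu_partial() and set_cpu_partial()):

	/* Adding a frozen slab to the percpu partial list: count pages only. */
	page->pages = oldpage ? oldpage->pages + 1 : 1;	/* always precise */
	page->next = oldpage;

	/* Converting the object-based target to pages, assuming pages are
	 * half-filled on average (oo_objects() is objects per slab): */
	s->cpu_partial_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));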
[1] https://lore.kernel.org/linux-mm/CAG48ez2Qx5K1Cab-m8BdSibp6wLTip6ro4=-umR7BLsEgjEYzA@mail.gmail.com/ [2] https://lore.kernel.org/all/2f0f46e8-2535-410a-1859-e9cfa4e57c18@suse.cz/ ========== Evaluation ========== Mel was kind enough to run v1 through mmtests machinery for netperf (localhost) and hackbench and, for most significant results see below. So there are some apparent regressions, especially with hackbench, which I think ultimately boils down to having shorter percpu partial lists on average and some benchmarks benefiting from longer ones. Monitoring slab usage also indicated less memory usage by slab. Based on that, the following patch will bump the defaults to allow longer percpu partial lists than after this patch. However the goal is certainly not such that we would limit the percpu partial lists to 30 pages just because previously a specific alloc/free pattern could lead to the limit of 30 objects translate to a limit to 30 pages - that would make little sense. This is a correctness patch, and if a workload benefits from larger lists, the sysfs tuning knobs are still there to allow that. Netperf 2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads per socket), 384GB RAM TCP-RR: hmean before 127045.79 after 121092.94 (-4.69%, worse) stddev before 2634.37 after 1254.08 UDP-RR: hmean before 166985.45 after 160668.94 ( -3.78%, worse) stddev before 4059.69 after 1943.63 2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads per socket), 512GB RAM TCP-RR: hmean before 84173.25 after 76914.72 ( -8.62%, worse) UDP-RR: hmean before 93571.12 after 96428.69 ( 3.05%, better) stddev before 23118.54 after 16828.14 2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads per socket), 64GB RAM TCP-RR: hmean before 49984.92 after 48922.27 ( -2.13%, worse) stddev before 6248.15 after 4740.51 UDP-RR: hmean before 61854.31 after 68761.81 ( 11.17%, better) stddev before 4093.54 after 5898.91 other machines - within 2% Hackbench (results before and after the patch, negative % means worse) 2-socket AMD EPYC 7713 (64 cores, 128 threads per core), 256GB RAM hackbench-process-sockets Amean 1 0.5380 0.5583 ( -3.78%) Amean 4 0.7510 0.8150 ( -8.52%) Amean 7 0.7930 0.9533 ( -20.22%) Amean 12 0.7853 1.1313 ( -44.06%) Amean 21 1.1520 1.4993 ( -30.15%) Amean 30 1.6223 1.9237 ( -18.57%) Amean 48 2.6767 2.9903 ( -11.72%) Amean 79 4.0257 5.1150 ( -27.06%) Amean 110 5.5193 7.4720 ( -35.38%) Amean 141 7.2207 9.9840 ( -38.27%) Amean 172 8.4770 12.1963 ( -43.88%) Amean 203 9.6473 14.3137 ( -48.37%) Amean 234 11.3960 18.7917 ( -64.90%) Amean 265 13.9627 22.4607 ( -60.86%) Amean 296 14.9163 26.0483 ( -74.63%) hackbench-thread-sockets Amean 1 0.5597 0.5877 ( -5.00%) Amean 4 0.7913 0.8960 ( -13.23%) Amean 7 0.8190 1.0017 ( -22.30%) Amean 12 0.9560 1.1727 ( -22.66%) Amean 21 1.7587 1.5660 ( 10.96%) Amean 30 2.4477 1.9807 ( 19.08%) Amean 48 3.4573 3.0630 ( 11.41%) Amean 79 4.7903 5.1733 ( -8.00%) Amean 110 6.1370 7.4220 ( -20.94%) Amean 141 7.5777 9.2617 ( -22.22%) Amean 172 9.2280 11.0907 ( -20.18%) Amean 203 10.2793 13.3470 ( -29.84%) Amean 234 11.2410 17.1070 ( -52.18%) Amean 265 12.5970 23.3323 ( -85.22%) Amean 296 17.1540 24.2857 ( -41.57%) 2-socket Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz (20 cores, 40 threads per socket), 384GB RAM hackbench-process-sockets Amean 1 0.5760 0.4793 ( 16.78%) Amean 4 0.9430 0.9707 ( -2.93%) Amean 7 1.5517 1.8843 ( -21.44%) Amean 12 2.4903 2.7267 ( -9.49%) Amean 21 3.9560 4.2877 ( -8.38%) Amean 30 5.4613 5.8343 ( -6.83%) Amean 48 
8.5337 9.2937 ( -8.91%) Amean 79 14.0670 15.2630 ( -8.50%) Amean 110 19.2253 21.2467 ( -10.51%) Amean 141 23.7557 25.8550 ( -8.84%) Amean 172 28.4407 29.7603 ( -4.64%) Amean 203 33.3407 33.9927 ( -1.96%) Amean 234 38.3633 39.1150 ( -1.96%) Amean 265 43.4420 43.8470 ( -0.93%) Amean 296 48.3680 48.9300 ( -1.16%) hackbench-thread-sockets Amean 1 0.6080 0.6493 ( -6.80%) Amean 4 1.0000 1.0513 ( -5.13%) Amean 7 1.6607 2.0260 ( -22.00%) Amean 12 2.7637 2.9273 ( -5.92%) Amean 21 5.0613 4.5153 ( 10.79%) Amean 30 6.3340 6.1140 ( 3.47%) Amean 48 9.0567 9.5577 ( -5.53%) Amean 79 14.5657 15.7983 ( -8.46%) Amean 110 19.6213 21.6333 ( -10.25%) Amean 141 24.1563 26.2697 ( -8.75%) Amean 172 28.9687 30.2187 ( -4.32%) Amean 203 33.9763 34.6970 ( -2.12%) Amean 234 38.8647 39.3207 ( -1.17%) Amean 265 44.0813 44.1507 ( -0.16%) Amean 296 49.2040 49.4330 ( -0.47%) 2-socket Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz (20 cores, 40 threads per socket), 512GB RAM hackbench-process-sockets Amean 1 0.5027 0.5017 ( 0.20%) Amean 4 1.1053 1.2033 ( -8.87%) Amean 7 1.8760 2.1820 ( -16.31%) Amean 12 2.9053 3.1810 ( -9.49%) Amean 21 4.6777 4.9920 ( -6.72%) Amean 30 6.5180 6.7827 ( -4.06%) Amean 48 10.0710 10.5227 ( -4.48%) Amean 79 16.4250 17.5053 ( -6.58%) Amean 110 22.6203 24.4617 ( -8.14%) Amean 141 28.0967 31.0363 ( -10.46%) Amean 172 34.4030 36.9233 ( -7.33%) Amean 203 40.5933 43.0850 ( -6.14%) Amean 234 46.6477 48.7220 ( -4.45%) Amean 265 53.0530 53.9597 ( -1.71%) Amean 296 59.2760 59.9213 ( -1.09%) hackbench-thread-sockets Amean 1 0.5363 0.5330 ( 0.62%) Amean 4 1.1647 1.2157 ( -4.38%) Amean 7 1.9237 2.2833 ( -18.70%) Amean 12 2.9943 3.3110 ( -10.58%) Amean 21 4.9987 5.1880 ( -3.79%) Amean 30 6.7583 7.0043 ( -3.64%) Amean 48 10.4547 10.8353 ( -3.64%) Amean 79 16.6707 17.6790 ( -6.05%) Amean 110 22.8207 24.4403 ( -7.10%) Amean 141 28.7090 31.0533 ( -8.17%) Amean 172 34.9387 36.8260 ( -5.40%) Amean 203 41.1567 43.0450 ( -4.59%) Amean 234 47.3790 48.5307 ( -2.43%) Amean 265 53.9543 54.6987 ( -1.38%) Amean 296 60.0820 60.2163 ( -0.22%) 1-socket Intel(R) Xeon(R) CPU E3-1240 v5 @ 3.50GHz (4 cores, 8 threads), 32 GB RAM hackbench-process-sockets Amean 1 1.4760 1.5773 ( -6.87%) Amean 3 3.9370 4.0910 ( -3.91%) Amean 5 6.6797 6.9357 ( -3.83%) Amean 7 9.3367 9.7150 ( -4.05%) Amean 12 15.7627 16.1400 ( -2.39%) Amean 18 23.5360 23.6890 ( -0.65%) Amean 24 31.0663 31.3137 ( -0.80%) Amean 30 38.7283 39.0037 ( -0.71%) Amean 32 41.3417 41.6097 ( -0.65%) hackbench-thread-sockets Amean 1 1.5250 1.6043 ( -5.20%) Amean 3 4.0897 4.2603 ( -4.17%) Amean 5 6.7760 7.0933 ( -4.68%) Amean 7 9.4817 9.9157 ( -4.58%) Amean 12 15.9610 16.3937 ( -2.71%) Amean 18 23.9543 24.3417 ( -1.62%) Amean 24 31.4400 31.7217 ( -0.90%) Amean 30 39.2457 39.5467 ( -0.77%) Amean 32 41.8267 42.1230 ( -0.71%) 2-socket Intel(R) Xeon(R) CPU E5-2670 v3 @ 2.30GHz (12 cores, 24 threads per socket), 64GB RAM hackbench-process-sockets Amean 1 1.0347 1.0880 ( -5.15%) Amean 4 1.7267 1.8527 ( -7.30%) Amean 7 2.6707 2.8110 ( -5.25%) Amean 12 4.1617 4.3383 ( -4.25%) Amean 21 7.0070 7.2600 ( -3.61%) Amean 30 9.9187 10.2397 ( -3.24%) Amean 48 15.6710 16.3923 ( -4.60%) Amean 79 24.7743 26.1247 ( -5.45%) Amean 110 34.3000 35.9307 ( -4.75%) Amean 141 44.2043 44.8010 ( -1.35%) Amean 172 54.2430 54.7260 ( -0.89%) Amean 192 60.6557 60.9777 ( -0.53%) hackbench-thread-sockets Amean 1 1.0610 1.1353 ( -7.01%) Amean 4 1.7543 1.9140 ( -9.10%) Amean 7 2.7840 2.9573 ( -6.23%) Amean 12 4.3813 4.4937 ( -2.56%) Amean 21 7.3460 7.5350 ( -2.57%) Amean 30 10.2313 10.5190 ( -2.81%) Amean 48 15.9700 16.5940 
( -3.91%) Amean 79 25.3973 26.6637 ( -4.99%) Amean 110 35.1087 36.4797 ( -3.91%) Amean 141 45.8220 46.3053 ( -1.05%) Amean 172 55.4917 55.7320 ( -0.43%) Amean 192 62.7490 62.5410 ( 0.33%) Link: https://lkml.kernel.org/r/20211012134651.11258-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Jann Horn Cc: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 -- include/linux/slub_def.h | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..68ffa064b7a8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -124,10 +124,8 @@ struct page { struct page *next; #ifdef CONFIG_64BIT int pages; /* Nr of pages left */ - int pobjects; /* Approximate count */ #else short int pages; - short int pobjects; #endif }; }; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 85499f0586b0..0fa751b946fa 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -99,6 +99,8 @@ struct kmem_cache { #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; + /* Number of per cpu partial pages to keep around */ + unsigned int cpu_partial_pages; #endif struct kmem_cache_order_objects oo; @@ -141,17 +143,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_cpu_partial(s) ((s)->cpu_partial) -#define slub_set_cpu_partial(s, n) \ -({ \ - slub_cpu_partial(s) = (n); \ -}) -#else -#define slub_cpu_partial(s) (0) -#define slub_set_cpu_partial(s, n) -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_unlink(struct kmem_cache *); -- cgit v1.2.3 From 96c84dde362a3e4ff67a12eaac2ea6e88e963c07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:35:30 -0700 Subject: mm: don't include <linux/dax.h> in <linux/mempolicy.h> Not required at all, and having this causes a huge kernel rebuild as soon as something in dax.h changes. Link: https://lkml.kernel.org/r/20210921082253.1859794-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Naoya Horiguchi Reviewed-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4091692bed8c..097bbbb37493 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -8,7 +8,6 @@ #include <linux/sched.h> #include <linux/mmzone.h> -#include <linux/dax.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> -- cgit v1.2.3 From 7857ccdf94e94f1dd56496b90084fdcaa5e655a4 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:33 -0700 Subject: lib/stackdepot: include gfp.h Patch series "stackdepot, kasan, workqueue: Avoid expanding stackdepot slabs when holding raw_spin_lock", v2. Shuah Khan reported [1]: | When CONFIG_PROVE_RAW_LOCK_NESTING=y and CONFIG_KASAN are enabled, | kasan_record_aux_stack() runs into "BUG: Invalid wait context" when | it tries to allocate memory attempting to acquire spinlock in page | allocation code while holding workqueue pool raw_spinlock. | | There are several instances of this problem when block layer tries | to __queue_work().
Call trace from one of these instances is below: | | kblockd_mod_delayed_work_on() | mod_delayed_work_on() | __queue_delayed_work() | __queue_work() (rcu_read_lock, raw_spin_lock pool->lock held) | insert_work() | kasan_record_aux_stack() | kasan_save_stack() | stack_depot_save() | alloc_pages() | __alloc_pages() | get_page_from_freelist() | rm_queue() | rm_queue_pcplist() | local_lock_irqsave(&pagesets.lock, flags); | [ BUG: Invalid wait context triggered ] PROVE_RAW_LOCK_NESTING is pointing out that (on RT kernels) the locking rules are being violated. More generally, memory is being allocated from a non-preemptive context (a raw_spin_lock'd critical section) where it is not allowed. To properly fix this, we must prevent stackdepot from replenishing its "stack slab" pool if memory allocations cannot be done in the current context: it's a bug to use either GFP_ATOMIC or GFP_NOWAIT in certain non-preemptive contexts, including raw_spin_locks (see gfp.h and commit ab00db216c9c7). The only downside is that saving a stack trace may fail if: stackdepot runs out of space AND the same stack trace has not been recorded before. I expect this to be unlikely, and a simple experiment (boot the kernel) didn't result in any failure to record a stack trace from insert_work(). The series includes a few minor fixes to stackdepot that I noticed in preparing the series. It then introduces __stack_depot_save(), which exposes the option to force stackdepot to not allocate any memory. Finally, KASAN is changed to use the new stackdepot interface and provide kasan_record_aux_stack_noalloc(), which is then used by workqueue code. [1] https://lkml.kernel.org/r/20210902200134.25603-1-skhan@linuxfoundation.org This patch (of 6): <linux/stackdepot.h> refers to gfp_t, but doesn't include gfp.h. Fix it by including <linux/gfp.h>. Link: https://lkml.kernel.org/r/20210913112609.2651084-1-elver@google.com Link: https://lkml.kernel.org/r/20210913112609.2651084-2-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Tejun Heo Cc: Lai Jiangshan Cc: Walter Wu Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: "Gustavo A. R. Silva" Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 6bb4bc1a5f54..97b36dc53301 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,6 +11,8 @@ #ifndef _LINUX_STACKDEPOT_H #define _LINUX_STACKDEPOT_H +#include <linux/gfp.h> + typedef u32 depot_stack_handle_t; depot_stack_handle_t stack_depot_save(unsigned long *entries, -- cgit v1.2.3 From 11ac25c62cd2f3bb8da9e1df2e71afdebe76f093 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:39 -0700 Subject: lib/stackdepot: introduce __stack_depot_save() Add __stack_depot_save(), which provides more fine-grained control over stackdepot's memory allocation behaviour, in case stackdepot runs out of "stack slabs". Normally stackdepot uses alloc_pages() in case it runs out of space; passing can_alloc==false to __stack_depot_save() prohibits this, at the cost of more likely failure to record a stack trace.
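A usage sketch for the new entry point (hypothetical caller; the buffer size and GFP flags are illustrative):

	unsigned long entries[64];
	unsigned int nr_entries;
	depot_stack_handle_t handle;

	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	/* can_alloc == false: never call alloc_pages(); may return 0 instead */
	handle = __stack_depot_save(entries, nr_entries, GFP_NOWAIT, false);
	if (!handle)
		pr_debug("trace was new and the depot had no free space\n");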
Link: https://lkml.kernel.org/r/20210913112609.2651084-4-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 97b36dc53301..b2f7e7c6ba54 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -15,6 +15,10 @@ typedef u32 depot_stack_handle_t; +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); -- cgit v1.2.3 From 7cb3007ce2da27ec02a1a3211941e7fe6875b642 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:46 -0700 Subject: kasan: generic: introduce kasan_record_aux_stack_noalloc() Introduce a variant of kasan_record_aux_stack() that does not do any memory allocation through stackdepot. This will permit using it in contexts that cannot allocate any memory. Link: https://lkml.kernel.org/r/20210913112609.2651084-6-elver@google.com Signed-off-by: Marco Elver Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dd874a1ee862..736d7b458996 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -370,12 +370,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); +void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} +static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ -- cgit v1.2.3 From 86cffecdeaa278444870c8745ab166a65865dbf0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:19 -0700 Subject: Compiler Attributes: add __alloc_size() for better bounds checking GCC and Clang can use the "alloc_size" attribute to better inform the results of __builtin_object_size() (for compile-time constant values). Clang can additionally use alloc_size to inform the results of __builtin_dynamic_object_size() (for run-time values). Because GCC sees the frequent use of struct_size() as an allocator size argument, and notices it can return SIZE_MAX (the overflow indication), it complains about these call sites overflowing (since SIZE_MAX is greater than the default -Walloc-size-larger-than=PTRDIFF_MAX). 
This isn't helpful since we already know a SIZE_MAX will be caught at run-time (this was an intentional design). To deal with this, we must disable this check as it is both a false positive and redundant. (Clang does not have this warning option.) Unfortunately, just checking the -Wno-alloc-size-larger-than is not sufficient to make the __alloc_size attribute behave correctly under older GCC versions. The attribute itself must be disabled in those situations too, as there appears to be no way to reliably silence the SIZE_MAX constant expression cases for GCC versions less than 9.1: In file included from ./include/linux/resource_ext.h:11, from ./include/linux/pci.h:40, from drivers/net/ethernet/intel/ixgbe/ixgbe.h:9, from drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c:4: In function 'kmalloc_node', inlined from 'ixgbe_alloc_q_vector' at ./include/linux/slab.h:743:9: ./include/linux/slab.h:618:9: error: argument 1 value '18446744073709551615' exceeds maximum object size 9223372036854775807 [-Werror=alloc-size-larger-than=] return __kmalloc_node(size, flags, node); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/slab.h: In function 'ixgbe_alloc_q_vector': ./include/linux/slab.h:455:7: note: in a call to allocation function '__kmalloc_node' declared here void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_slab_alignment __malloc; ^~~~~~~~~~~~~~ Specifically: '-Wno-alloc-size-larger-than' is not correctly handled by GCC < 9.1 https://godbolt.org/z/hqsfG7q84 (doesn't disable) https://godbolt.org/z/P9jdrPTYh (doesn't admit to not knowing about option) https://godbolt.org/z/465TPMWKb (only warns when other warnings appear) '-Walloc-size-larger-than=18446744073709551615' is not handled by GCC < 8.2 https://godbolt.org/z/73hh1EPxz (ignores numeric value) Since anything marked with __alloc_size would also qualify for marking with __malloc, just include __malloc along with it to avoid redundant markings. (Suggested by Linus Torvalds.) Finally, make sure checkpatch.pl doesn't get confused about finding the __alloc_size attribute on functions. (Thanks to Joe Perches.) Link: https://lkml.kernel.org/r/20210930222704.2631604-3-keescook@chromium.org Signed-off-by: Kees Cook Tested-by: Randy Dunlap Cc: Andy Whitcroft Cc: Christoph Lameter Cc: Daniel Micay Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 8 ++++++++ include/linux/compiler_attributes.h | 10 ++++++++++ include/linux/compiler_types.h | 12 ++++++++++++ 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bd2b881c6b63..b9d5f9c373a0 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -144,3 +144,11 @@ #else #define __diag_GCC_8(s) #endif + +/* + * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" + * attribute) do not work, and must be disabled. 
+ */ +#if GCC_VERSION < 90100 +#undef __alloc_size__ +#endif diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index e6ec63403965..3de06a8fae73 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -33,6 +33,15 @@ #define __aligned(x) __attribute__((__aligned__(x))) #define __aligned_largest __attribute__((__aligned__)) +/* + * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally + * available and includes other attributes. + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size + */ +#define __alloc_size__(x, ...) __attribute__((__alloc_size__(x, ## __VA_ARGS__))) + /* * Note: users of __always_inline currently do not write "inline" themselves, * which seems to be required by gcc to apply the attribute according @@ -153,6 +162,7 @@ /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#malloc */ #define __malloc __attribute__((__malloc__)) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b6ff83a714ca..4f2203c4a257 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -250,6 +250,18 @@ struct ftrace_likely_data { # define __cficanonical #endif +/* + * Any place that could be marked with the "alloc_size" attribute is also + * a place to be marked with the "malloc" attribute. Do this as part of the + * __alloc_size macro to avoid redundant attributes and to avoid missing a + * __malloc marking. + */ +#ifdef __alloc_size__ +# define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +#else +# define __alloc_size(x, ...) __malloc +#endif + #ifndef asm_volatile_goto #define asm_volatile_goto(x...) asm goto(x) #endif -- cgit v1.2.3 From 72d67229f522e3331d1eabd9f58d36ae080eb228 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:23 -0700 Subject: slab: clean up function prototypes Based on feedback from Joe Perches and Linus Torvalds, regularize the slab function prototypes before making attribute changes. Link: https://lkml.kernel.org/r/20210930222704.2631604-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Andy Whitcroft Cc: Daniel Micay Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: Joe Perches Cc: John Hubbard Cc: kernel test robot Cc: Lukas Bulwahn Cc: Matt Porter Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Randy Dunlap Cc: Souptick Joarder Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 68 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c0d46b6fa12a..d05de03bdcdd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -152,8 +152,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -void kmem_cache_destroy(struct kmem_cache *); -int kmem_cache_shrink(struct kmem_cache *); +void kmem_cache_destroy(struct kmem_cache *s); +int kmem_cache_shrink(struct kmem_cache *s); /* * Please use this macro to create slab caches. Simply specify the @@ -181,11 +181,11 @@ int kmem_cache_shrink(struct kmem_cache *); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *, size_t, gfp_t); -void kfree(const void *); -void kfree_sensitive(const void *); -size_t __ksize(const void *); -size_t ksize(const void *); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags); +void kfree(const void *objp); +void kfree_sensitive(const void *objp); +size_t __ksize(const void *objp); +size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -426,8 +426,8 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; -void kmem_cache_free(struct kmem_cache *, void *); +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; +void kmem_cache_free(struct kmem_cache *s, void *objp); /* * Bulk allocation and freeing operations. These are accelerated in an @@ -436,8 +436,8 @@ void kmem_cache_free(struct kmem_cache *, void *); * * Note that interrupts must be enabled when calling these functions. 
*/ -void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); /* * Caller must not use kfree_bulk() on memory not originally allocated @@ -450,7 +450,8 @@ static __always_inline void kfree_bulk(size_t size, void **p) #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment + __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { @@ -464,25 +465,24 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; +extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_slab_alignment __malloc; #else -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, + size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, + size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -490,10 +490,8 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, return ret; } -static __always_inline void * -kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -502,13 +500,14 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment + __malloc; #ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; +extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) + __assume_page_alignment __malloc; #else -static __always_inline void * -kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { return kmalloc_order(size, flags, order); } @@ -638,8 +637,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate 
(see kmalloc) */ -static __must_check inline void * -krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags) +static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size, + gfp_t flags) { size_t bytes; @@ -668,7 +667,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) @@ -691,7 +690,8 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) #ifdef CONFIG_NUMA -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); +extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) -- cgit v1.2.3 From c37495d6254c237578db3121dcf79857e033f8ff Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:27 -0700 Subject: slab: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-5-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 61 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index d05de03bdcdd..b5bf0537975b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -181,7 +181,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -425,7 +425,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *s, void *objp); @@ -449,11 +449,12 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment + __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #else -static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc(size, flags); } @@ -466,23 +467,23 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __malloc; + __assume_slab_alignment __alloc_size(3); #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; + int node, size_t size) __assume_slab_alignment + __alloc_size(4); #else -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, - size_t size) +static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, - size_t size) +static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, + gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -501,19 +502,20 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g #endif /* CONFIG_TRACING */ extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __malloc; + __alloc_size(1); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment 
__malloc; + __assume_page_alignment __alloc_size(1); #else -static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, + unsigned int order) { return kmalloc_order(size, flags, order); } #endif -static __always_inline void *kmalloc_large(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); return kmalloc_order_trace(size, flags, order); @@ -573,7 +575,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Try really hard to succeed the allocation but fail * eventually. */ -static __always_inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { #ifndef CONFIG_SLOB @@ -595,7 +597,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && @@ -619,7 +621,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -637,8 +639,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size, - gfp_t flags) +static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -654,7 +658,7 @@ static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t n * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) { return kmalloc_array(n, size, flags | __GFP_ZERO); } @@ -667,12 +671,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. 
*/ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) + __alloc_size(1); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, - int node) +static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) { size_t bytes; @@ -683,7 +688,7 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, return __kmalloc_node(bytes, flags, node); } -static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) { return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } @@ -691,7 +696,7 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller); + unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) @@ -716,7 +721,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) { return kmalloc(size, flags | __GFP_ZERO); } @@ -727,7 +732,7 @@ static inline void *kzalloc(size_t size, gfp_t flags) * @flags: the type of memory to allocate (see kmalloc). * @node: memory node from which to allocate */ -static inline void *kzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags | __GFP_ZERO, node); } -- cgit v1.2.3 From 56bcf40f91c7d7f53b77abe161a7d18ef9f981ba Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:31 -0700 Subject: mm/kvmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kvmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-6-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index b5bf0537975b..837cb16232ef 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -737,21 +737,21 @@ static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int n return kmalloc_node(size, flags | __GFP_ZERO, node); } -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) +extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); +static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) { return kvmalloc_node(size, flags, NUMA_NO_NODE); } -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) { return kvmalloc_node(size, flags | __GFP_ZERO, node); } -static inline void *kvzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -761,13 +761,13 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) return kvmalloc(bytes, flags); } -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) { return kvmalloc_array(n, size, flags | __GFP_ZERO); } -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + __alloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); -- cgit v1.2.3 From 894f24bb569afd4fe4a874c636f82d47f1c9beed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:34 -0700 Subject: mm/vmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate vmalloc allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-7-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 671d402c3778..0ed56fc10c11 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -136,21 +136,21 @@ static inline void vmalloc_init(void) static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size); -extern void *vzalloc(unsigned long size); -extern void *vmalloc_user(unsigned long size); -extern void *vmalloc_node(unsigned long size, int node); -extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_32(unsigned long size); -extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +extern void *vmalloc(unsigned long size) __alloc_size(1); +extern void *vzalloc(unsigned long size) __alloc_size(1); +extern void *vmalloc_user(unsigned long size) __alloc_size(1); +extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vmalloc_32(unsigned long size) __alloc_size(1); +extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, - const void *caller); + const void *caller) __alloc_size(1); void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller); -void *vmalloc_no_huge(unsigned long size); + int node, const void *caller) __alloc_size(1); +void *vmalloc_no_huge(unsigned long size) __alloc_size(1); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); -- cgit v1.2.3 From abd58f38dfb4771ef06e25d71a84aa0932c52f1a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:38 -0700 Subject: mm/page_alloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate page allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-8-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..fbd4abc33f24 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -608,9 +608,9 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -void *alloc_pages_exact(size_t size, gfp_t gfp_mask); +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) -- cgit v1.2.3 From 17197dd460469c6fca66e274f5cd51ee43bb6ddd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:42 -0700 Subject: percpu: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate percpu allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Note that due to the implementation of the percpu API, this is unlikely to ever actually provide compile-time checking beyond very simple non-SMP builds. But, since they are technically allocators, mark them as such. Link: https://lkml.kernel.org/r/20210930222704.2631604-9-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Andy Whitcroft Cc: David Rientjes Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. 
Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..98a9371133f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); @@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); extern void __init setup_per_cpu_areas(void); #endif -extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); -extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); +extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -- cgit v1.2.3 From 0b3ea0926afb8dde70cfab00316ae0a70b93a7cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:58 -0700 Subject: fs: explicitly unregister per-superblock BDIs Add a new SB_I_ flag to mark superblocks that have an ephemeral bdi associated with them, and unregister it when the superblock is shut down. Link: https://lkml.kernel.org/r/20211021124441.668816-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..226de651f52e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1443,6 +1443,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From efee17134ca464639a2f5b4d036ce40caf1b247a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:37:04 -0700 Subject: mm: simplify bdi refcounting Move grabbing and releasing the bdi refcount out of the common wb_init/wb_exit helpers into code that is only used for the non-default memcg driven bdi_writeback structures. 
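As a concrete illustration of what the __alloc_size annotations added to the slab, vmalloc, page and percpu allocator headers above buy us, here is a minimal userspace sketch (plain C; xmalloc() is a hypothetical wrapper written for illustration, not a kernel or patch API). With the hint in place, the compiler's __builtin_object_size() machinery, which FORTIFY_SOURCE-style checking builds on, can diagnose the oversized write at compile time:

    #include <stdlib.h>
    #include <string.h>

    /* Userspace analogue of the kernel's __alloc_size(1) hint: promise
     * the compiler that the returned buffer is exactly 'n' bytes long. */
    __attribute__((malloc, alloc_size(1)))
    static void *xmalloc(size_t n)
    {
            void *p = malloc(n);

            if (!p)
                    abort();
            return p;
    }

    int main(void)
    {
            char *buf = xmalloc(16);

            /* The alloc_size hint lets the compiler see a 16-byte object
             * here, so this 32-byte write can be flagged at build time
             * (for example by GCC's -Wstringop-overflow at -O2). */
            memset(buf, 0, 32);

            free(buf);
            return 0;
    }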
[hch@lst.de: add comment] Link: https://lkml.kernel.org/r/20211027074207.GA12793@lst.de [akpm@linux-foundation.org: fix typo] Link: https://lkml.kernel.org/r/20211021124441.668816-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 33207004cfde..993c5628a726 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -103,6 +103,9 @@ struct wb_completion { * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. + * + * Each bdi_writeback that is not embedded into the backing_dev_info must hold + * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ -- cgit v1.2.3 From e80216d9f1f5c90b3cab834cd4fb492d731f70aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:56 -0700 Subject: mm: memcontrol: remove the kmem states The kmem state is now only used to indicate whether kmem is offline. However, we can set ->kmemcg_id to -1 to indicate that instead, which lets us remove the kmem states entirely and simplify the code. Link: https://lkml.kernel.org/r/20211025125259.56624-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..f207f98bdb76 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,12 +180,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -enum memcg_kmem_state { - KMEM_NONE, - KMEM_ALLOCATED, - KMEM_ONLINE, -}; - #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; @@ -318,7 +312,6 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; - enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; struct list_head objcg_list; /* list of inherited objcgs */ #endif -- cgit v1.2.3 From f1dc0db296bd25960273649fc6ef2ecbf5aaa0e0 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 5 Nov 2021 13:38:15 -0700 Subject: mm: use __pfn_to_section() instead of open coding it __pfn_to_section() is defined in the same file just a few lines above.
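For context, the helper whose open-coded form the patch below replaces reads roughly as follows in the same header:

    static inline struct mem_section *__pfn_to_section(unsigned long pfn)
    {
            return __nr_to_section(pfn_to_section_nr(pfn));
    }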
Link: https://lkml.kernel.org/r/4598487.Rc0NezkW7i@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6a1d79d84675..832aa49d0d8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1481,7 +1481,7 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - ms = __nr_to_section(pfn_to_section_nr(pfn)); + ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* @@ -1496,7 +1496,7 @@ static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return present_section(__nr_to_section(pfn_to_section_nr(pfn))); + return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) -- cgit v1.2.3 From 232a6a1c0619d1b4d9cd8d21949b2f13821be0af Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:31 -0700 Subject: mm: drop first_index/last_index in zap_details The first_index/last_index parameters in zap_details are actually only used in unmap_mapping_range_tree(). Meanwhile, this function is called only once, from unmap_mapping_pages(). Instead of passing these two variables through the whole stack of page zapping code, remove them from zap_details and let them simply be parameters of unmap_mapping_range_tree(), which is inlined. Link: https://lkml.kernel.org/r/20210915181535.11238-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Reviewed-by: Liam Howlett Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jerome Glisse Cc: "Kirill A.
Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 32b2ecc47408..c5b600d7f85b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1688,8 +1688,6 @@ extern void user_shm_unlock(size_t, struct ucounts *); */ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ struct page *single_page; /* Locked page to be unmapped */ }; -- cgit v1.2.3 From 91b61ef333cf43f96b3522a086c9ac925763d6e5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:34 -0700 Subject: mm: add zap_skip_check_mapping() helper Use the helper for the checks. Rename "check_mapping" into "zap_mapping" because "check_mapping" looks like a bool but in fact it stores the mapping itself. When it's set, we check the mapping (it must be non-NULL). When it's cleared we skip the check, which works like the old way. Move the duplicated comments to the helper too. Link: https://lkml.kernel.org/r/20210915181538.11288-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A. Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c5b600d7f85b..6e29deb1e0d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1687,10 +1687,24 @@ extern void user_shm_unlock(size_t, struct ucounts *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ + struct address_space *zap_mapping; /* Check page->mapping if set */ struct page *single_page; /* Locked page to be unmapped */ }; +/* + * We set details->zap_mapping when we want to unmap shared but keep private + * pages. Return true if we should skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From e26e0cc30b48cad5f2704f6a22fa5cac9fdaaece Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 5 Nov 2021 13:39:00 -0700 Subject: memory: remove unused CONFIG_MEM_BLOCK_SIZE Commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") defines CONFIG_MEM_BLOCK_SIZE, but this has never been utilized anywhere. It is a good practice to keep the CONFIG_* defines exclusively for the Kbuild system. So, drop this unused definition. This issue was noticed due to running ./scripts/checkkconfigsymbols.py. Link: https://lkml.kernel.org/r/20211006120354.7468-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 182c606adb06..053a530c7bdd 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -140,7 +140,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); -#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) -- cgit v1.2.3 From: Lucas De Marchi Date: Fri, 5 Nov 2021 13:39:10 -0700 Subject: include/linux/io-mapping.h: remove fallback for writecombine The fallback was introduced in commit 80c33624e472 ("io-mapping: Fixup for different names of writecombine") to fix the build on microblaze. 5 years later, it seems all archs now provide a pgprot_writecombine(), so just remove the other possible fallbacks. For microblaze, pgprot_writecombine() is available since commit 97ccedd793ac ("microblaze: Provide pgprot_device/writecombine macros for nommu").
This is build-tested on microblaze with a hack to always build mm/io-mapping.o and without DIYing on an x86-only macro (_PAGE_CACHE_MASK). Link: https://lkml.kernel.org/r/20211020204838.1142908-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi Cc: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e9743cfd8585..66a774d2710e 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; -#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ - iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); -#elif defined(pgprot_writecombine) iomap->prot = pgprot_writecombine(PAGE_KERNEL); -#else - iomap->prot = pgprot_noncached(PAGE_KERNEL); -#endif return iomap; } -- cgit v1.2.3 From bd1a8fb2d43f7c293383f76691d7a55f7f89d9da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Nov 2021 13:39:22 -0700 Subject: mm/vmalloc: don't allow VM_NO_GUARD on vmap() The vmalloc guard pages are added on top of each allocation, thereby isolating any two allocations from one another. The top guard of the lower allocation is the bottom guard of the higher allocation etc. Therefore VM_NO_GUARD is dangerous; it breaks the basic premise of isolating separate allocations. There are only two in-tree users of this flag, neither of which uses it through the exported interface. Ensure it stays this way. Link: https://lkml.kernel.org/r/YUMfdA36fuyZ+/xt@hirez.programming.kicks-ass.net Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Will Deacon Acked-by: Kees Cook Cc: Andrey Konovalov Cc: Mel Gorman Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0ed56fc10c11..6e022cc712e6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,7 +22,7 @@ struct notifier_block; /* in notifier.h */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ -#define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ -- cgit v1.2.3 From 3252b1d8309ea42bc6329d9341072ecf1c9505c0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:39:47 -0700 Subject: kasan: arm64: fix pcpu_page_first_chunk crash with KASAN_VMALLOC With KASAN_VMALLOC and NEED_PER_CPU_PAGE_FIRST_CHUNK the kernel crashes: Unable to handle kernel paging request at virtual address ffff7000028f2000 ...
swapper pgtable: 64k pages, 48-bit VAs, pgdp=0000000042440000 [ffff7000028f2000] pgd=000000063e7c0003, p4d=000000063e7c0003, pud=000000063e7c0003, pmd=000000063e7b0003, pte=0000000000000000 Internal error: Oops: 96000007 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.0-rc4-00003-gc6e6e28f3f30-dirty #62 Hardware name: linux,dummy-virt (DT) pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO BTYPE=--) pc : kasan_check_range+0x90/0x1a0 lr : memcpy+0x88/0xf4 sp : ffff80001378fe20 ... Call trace: kasan_check_range+0x90/0x1a0 pcpu_page_first_chunk+0x3f0/0x568 setup_per_cpu_areas+0xb8/0x184 start_kernel+0x8c/0x328 The vm area used in vm_area_register_early() has no kasan shadow memory, so add a new kasan_populate_early_vm_area_shadow() function to populate the vm area shadow memory and fix the issue. [wangkefeng.wang@huawei.com: fix redefinition of 'kasan_populate_early_vm_area_shadow'] Link: https://lkml.kernel.org/r/20211011123211.3936196-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210910053354.26721-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Marco Elver [KASAN] Acked-by: Andrey Konovalov [KASAN] Acked-by: Catalin Marinas Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Greg Kroah-Hartman Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 736d7b458996..65487a3b2a71 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -436,6 +436,8 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); + #else /* CONFIG_KASAN_VMALLOC */ static inline int kasan_populate_vmalloc(unsigned long start, @@ -453,6 +455,10 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long free_region_start, unsigned long free_region_end) {} +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ -- cgit v1.2.3 From c00b6b9610991c042ff4c3153daaa3ea8522c210 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 5 Nov 2021 13:39:53 -0700 Subject: mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc: vmalloc will allocate pages with NUMA_NO_NODE by default, which results in pages being allocated one by one. In order to solve this, __alloc_pages_bulk and the mempolicy should be considered at the same time. 1) If a node is specified in the memory allocation request, allocate all pages with __alloc_pages_bulk. 2) If the policy interleaves memory, calculate how many pages should be allocated on each node, and use __alloc_pages_bulk to allocate the pages on each node.
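A hedged sketch of how a caller such as vmalloc might combine the new helper with a page-at-a-time fallback (fill_page_array() is a hypothetical wrapper written for illustration, not part of the patch):

    static int fill_page_array(gfp_t gfp, unsigned long nr, struct page **pages)
    {
            /* Bulk path: honours current->mempolicy, including interleave. */
            unsigned long got = alloc_pages_bulk_array_mempolicy(gfp, nr, pages);

            /* Fall back to single-page allocation for whatever the bulk
             * path could not provide. */
            while (got < nr) {
                    struct page *page = alloc_page(gfp);

                    if (!page)
                            return -ENOMEM; /* caller frees pages[0..got) */
                    pages[got++] = page;
            }
            return 0;
    }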
[1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun Reviewed-by: Uladzislau Rezki (Sony) Cc: Eric Dumazet Cc: Shakeel Butt Cc: Nicholas Piggin Cc: Kefeng Wang Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fbd4abc33f24..c1b262725dca 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -535,6 +535,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct list_head *page_list, struct page **page_array); +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, + struct page **page_array); + /* Bulk allocate order-0 pages */ static inline unsigned long alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) -- cgit v1.2.3 From 8ca1b5a49885f0c0c486544da46a9e0ac790831d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 5 Nov 2021 13:40:34 -0700 Subject: mm/page_alloc: detect allocation forbidden by cpuset and bail out early There was a report that starting an Ubuntu container in docker while using cpuset to bind it to movable nodes (a node that only has a movable zone, like a hotplug node or a Persistent Memory node in normal usage) will fail due to memory allocation failure; the OOM killer then gets involved and many other innocent processes get killed. It can be reproduced with this command: $ docker run -it --rm --cpuset-mems 4 ubuntu:latest bash -c "grep Mems_allowed /proc/self/status" (where node 4 is a movable node) runc:[2:INIT] invoked oom-killer: gfp_mask=0x500cc2(GFP_HIGHUSER|__GFP_ACCOUNT), order=0, oom_score_adj=0 CPU: 8 PID: 8291 Comm: runc:[2:INIT] Tainted: G W I E 5.8.2-0.g71b519a-default #1 openSUSE Tumbleweed (unreleased) Hardware name: Dell Inc. PowerEdge R640/0PHYDR, BIOS 2.6.4 04/09/2020 Call Trace: dump_stack+0x6b/0x88 dump_header+0x4a/0x1e2 oom_kill_process.cold+0xb/0x10 out_of_memory.part.0+0xaf/0x230 out_of_memory+0x3d/0x80 __alloc_pages_slowpath.constprop.0+0x954/0xa20 __alloc_pages_nodemask+0x2d3/0x300 pipe_write+0x322/0x590 new_sync_write+0x196/0x1b0 vfs_write+0x1c3/0x1f0 ksys_write+0xa7/0xe0 do_syscall_64+0x52/0xd0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Mem-Info: active_anon:392832 inactive_anon:182 isolated_anon:0 active_file:68130 inactive_file:151527 isolated_file:0 unevictable:2701 dirty:0 writeback:7 slab_reclaimable:51418 slab_unreclaimable:116300 mapped:45825 shmem:735 pagetables:2540 bounce:0 free:159849484 free_pcp:73 free_cma:0 Node 4 active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB all_unreclaimable?
no Node 4 Movable free:130021408kB min:9140kB low:139160kB high:269180kB reserved_highatomic:0KB active_anon:1448kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:130023424kB managed:130023424kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:292kB local_pcp:84kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 0 Node 4 Movable: 1*4kB (M) 0*8kB 0*16kB 1*32kB (M) 0*64kB 0*128kB 1*256kB (M) 1*512kB (M) 1*1024kB (M) 0*2048kB 31743*4096kB (M) = 130021156kB oom-kill:constraint=CONSTRAINT_CPUSET,nodemask=(null),cpuset=docker-9976a269caec812c134fa317f27487ee36e1129beba7278a463dd53e5fb9997b.scope,mems_allowed=4,global_oom,task_memcg=/system.slice/containerd.service,task=containerd,pid=4100,uid=0 Out of memory: Killed process 4100 (containerd) total-vm:4077036kB, anon-rss:51184kB, file-rss:26016kB, shmem-rss:0kB, UID:0 pgtables:676kB oom_score_adj:0 oom_reaper: reaped process 8248 (docker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 2054 (node_exporter), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 1452 (systemd-journal), now anon-rss:0kB, file-rss:8564kB, shmem-rss:4kB oom_reaper: reaped process 2146 (munin-node), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB oom_reaper: reaped process 8291 (runc:[2:INIT]), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB The reason is that in this case, the target cpuset nodes only have a movable zone, while the creation of an OS in docker sometimes needs to allocate memory in non-movable zones (dma/dma32/normal), e.g. GFP_HIGHUSER, and the cpuset limit forbids the allocation; out-of-memory killing is then involved even when normal nodes and movable nodes both have plenty of free memory. The OOM killer cannot help to resolve the situation, as there is no usable memory for the request within the cpuset scope. The only reasonable measure to take is to fail the allocation right away and have the caller deal with it. So add a check for cases like this in the slowpath of allocation, and bail out early, returning NULL for the allocation. As page allocation is one of the hottest paths in the kernel, an unconditional check would hurt all users, even those with sane cpuset configurations; add a static branch check and detect the abnormal config during cpuset memory binding setup so that the extra check cost in page allocation is not paid by everyone.
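The shape of the resulting slowpath check is roughly the following (a condensed sketch rather than the verbatim patch; 'ac' is the allocation context structure the page allocator already carries):

    /* In __alloc_pages_slowpath(): a hardwalled request from a cpuset
     * that contains only movable nodes can never satisfy a non-movable
     * allocation, so fail fast instead of escalating to the OOM killer.
     * The static branch behind cpusets_insane_config() keeps this check
     * free for sane configurations. */
    if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
            struct zoneref *z = first_zones_zonelist(ac->zonelist,
                                    ac->highest_zoneidx,
                                    &cpuset_current_mems_allowed);
            if (!z->zone)
                    goto nopage;
    }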
[thanks to Michal Hocko and David Rientjes for suggesting not handling it inside OOM code, adding the cpuset check, refining comments] Link: https://lkml.kernel.org/r/1632481657-68112-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 17 +++++++++++++++++ include/linux/mmzone.h | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d2b9c41c8edf..d58e0476ee8e 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -34,6 +34,8 @@ */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; +extern struct static_key_false cpusets_insane_config_key; + static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); @@ -51,6 +53,19 @@ static inline void cpuset_dec(void) static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } +/* + * This will get enabled whenever a cpuset configuration is considered + * unsupportable in general. E.g. movable only node which cannot satisfy + * any non movable allocations (see update_nodemask). Page allocator + * needs to make additional checks for those configurations and this + * check is meant to guard those checks without any overhead for sane + * configurations. + */ +static inline bool cpusets_insane_config(void) +{ + return static_branch_unlikely(&cpusets_insane_config_key); +} + extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); @@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline bool cpusets_enabled(void) { return false; } +static inline bool cpusets_insane_config(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 832aa49d0d8e..fb36a29e3aae 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1220,6 +1220,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) +/* Whether the 'nodes' are all movable nodes */ +static inline bool movable_only_nodes(nodemask_t *nodes) +{ + struct zonelist *zonelist; + struct zoneref *z; + int nid; + + if (nodes_empty(*nodes)) + return false; + + /* + * We can choose an arbitrary node from the nodemask to get a + * zonelist as they are interlinked. We just need to find + * at least one zone that can satisfy kernel allocations. + */ + nid = first_node(*nodes); + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); + return (!z->zone) ? true : false; +} + + #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif -- cgit v1.2.3 From d2635f2012a44e3d469ab9a4022162dbe0e53f21 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:40 -0700 Subject: mm: create a new system state and fix core_kernel_text() core_kernel_text() considers that until system_state is at least SYSTEM_RUNNING, init memory is valid. But init memory is freed a few lines before setting SYSTEM_RUNNING, so we have a small period of time when core_kernel_text() is wrong.
Create an intermediate system state called SYSTEM_FREEING_INITMEM that is set before init memory starts being freed, and use it in core_kernel_text() to report init memory invalid earlier. Link: https://lkml.kernel.org/r/9ecfdee7dd4d741d172cb93ff1d87f1c58127c9a.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Gerald Schaefer Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2776423a587e..471bc0593679 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -248,6 +248,7 @@ extern bool early_boot_irqs_disabled; extern enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, + SYSTEM_FREEING_INITMEM, SYSTEM_RUNNING, SYSTEM_HALT, SYSTEM_POWER_OFF, -- cgit v1.2.3 From 477d01fce8dacb41cae9363136ea56812e8baa59 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:40:58 -0700 Subject: mm: fix data race in PagePoisoned() PagePoisoned() accesses page->flags which can be updated concurrently: | BUG: KCSAN: data-race in next_uptodate_page / unlock_page | | write (marked) to 0xffffea00050f37c0 of 8 bytes by task 1872 on cpu 1: | instrument_atomic_write include/linux/instrumented.h:87 [inline] | clear_bit_unlock_is_negative_byte include/asm-generic/bitops/instrumented-lock.h:74 [inline] | unlock_page+0x102/0x1b0 mm/filemap.c:1465 | filemap_map_pages+0x6c6/0x890 mm/filemap.c:3057 | ... | read to 0xffffea00050f37c0 of 8 bytes by task 1873 on cpu 0: | PagePoisoned include/linux/page-flags.h:204 [inline] | PageReadahead include/linux/page-flags.h:382 [inline] | next_uptodate_page+0x456/0x830 mm/filemap.c:2975 | ... | CPU: 0 PID: 1873 Comm: systemd-udevd Not tainted 5.11.0-rc4-00001-gf9ce0be71d1f #1 To avoid the compiler tearing or otherwise optimizing the access, use READ_ONCE() to access flags. Link: https://lore.kernel.org/all/20210826144157.GA26950@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20210913113542.2658064-1-elver@google.com Reported-by: kernel test robot Signed-off-by: Marco Elver Acked-by: Kirill A. Shutemov Acked-by: Will Deacon Cc: Marco Elver Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fbfd3fad48f2..dd80cf0bb579 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -215,7 +215,7 @@ static __always_inline int PageCompound(struct page *page) #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return page->flags == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM -- cgit v1.2.3 From 73c54763482b841d9c14bad87eec98f80f700e0b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:41:17 -0700 Subject: mm/hugetlb: drop __unmap_hugepage_range definition from hugetlb.h Remove __unmap_hugepage_range() from the header file, because it is only used in hugetlb.c.
Link: https://lkml.kernel.org/r/20210917165108.9341-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1faebe1cd0ed..3cbf60464398 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -143,9 +143,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page); -void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct page *ref_page); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo(void); @@ -385,13 +382,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, BUG(); } -static inline void __unmap_hugepage_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) -{ - BUG(); -} - static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) -- cgit v1.2.3 From 79dfc695525f60c8410015362b8aa4eb5c57210c Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:20 -0700 Subject: hugetlb: add demote hugetlb page sysfs interfaces Patch series "hugetlb: add demote/split page functionality", v4. The concurrent use of multiple hugetlb page sizes on a single system is becoming more common. One of the reasons is better TLB support for gigantic page sizes on x86 hardware. In addition, hugetlb pages are being used to back VMs in hosting environments. When using hugetlb pages to back VMs, it is often desirable to preallocate hugetlb pools. This avoids the delay and uncertainty of allocating hugetlb pages at VM startup. In addition, preallocating huge pages minimizes the issue of memory fragmentation that increases the longer the system is up and running. In such environments, a combination of larger and smaller hugetlb pages are preallocated in anticipation of backing VMs of various sizes. Over time, the preallocated pool of smaller hugetlb pages may become depleted while larger hugetlb pages still remain. In such situations, it is desirable to convert larger hugetlb pages to smaller hugetlb pages. Converting larger to smaller hugetlb pages can be accomplished today by first freeing the larger page to the buddy allocator and then allocating the smaller pages. For example, to convert 50 GB pages on x86: gb_pages=`cat .../hugepages-1048576kB/nr_hugepages` m2_pages=`cat .../hugepages-2048kB/nr_hugepages` echo $(($gb_pages - 50)) > .../hugepages-1048576kB/nr_hugepages echo $(($m2_pages + 25600)) > .../hugepages-2048kB/nr_hugepages On an idle system this operation is fairly reliable and results are as expected. The number of 2MB pages is increased as expected and the time of the operation is a second or two. However, when there is activity on the system the following issues arise: 1) This process can take quite some time, especially if allocation of the smaller pages is not immediate and requires migration/compaction. 
2) There is no guarantee that the total size of smaller pages allocated will match the size of the larger page which was freed. This is because the area freed by the larger page could quickly be fragmented. In a test environment with a load that continually fills the page cache with clean pages, results such as the following can be observed: Unexpected number of 2MB pages allocated: Expected 25600, have 19944 real 0m42.092s user 0m0.008s sys 0m41.467s To address these issues, introduce the concept of hugetlb page demotion. Demotion provides a means of 'in place' splitting of a hugetlb page to pages of a smaller size. This avoids freeing pages to buddy and then trying to allocate from buddy. Page demotion is controlled via sysfs files that reside in the per-hugetlb page size and per node directories. - demote_size Target page size for demotion, a smaller huge page size. The file can be written to choose a smaller huge page size if multiple are available. - demote Writable number of hugetlb pages to be demoted To demote 50 1GB huge pages, one would: cat .../hugepages-1048576kB/free_hugepages /* optional, verify free pages */ cat .../hugepages-1048576kB/demote_size /* optional, verify target size */ echo 50 > .../hugepages-1048576kB/demote Only hugetlb pages which are free at the time of the request can be demoted. Demotion does not add to the complexity of surplus pages and honors reserved huge pages. Therefore, when a value is written to the sysfs demote file, that value is only the maximum number of pages which will be demoted. It is possible fewer will actually be demoted. The recently introduced per-hstate mutex is used to synchronize demote operations with other operations that modify hugetlb pools. Real world use cases -------------------- The above scenario describes a real world use case where hugetlb pages are used to back VMs on x86. Both issues of long allocation times and not necessarily getting the expected number of smaller huge pages after a free and allocate cycle have been experienced. The occurrence of these issues is dependent on other activity within the host and can not be predicted. This patch (of 5): Two new sysfs files are added to demote hugetlb pages. These files are both per-hugetlb page size and per node. Files are: demote_size - The size in Kb that pages are demoted to. (read-write) demote - The number of huge pages to demote. (write-only) By default, demote_size is the next smallest huge page size. Huge page sizes smaller than the current huge page size may be written to this file. When huge pages are demoted, they are demoted to this size. Writing a value to demote will result in an attempt to demote that number of hugetlb pages to an appropriate number of demote_size pages. NOTE: Demote interfaces are only provided for huge page sizes if there is a smaller target demote huge page size. For example, on x86 1GB huge pages will have demote interfaces. 2MB huge pages will not have demote interfaces. This patch does not provide full demote functionality. It only provides the sysfs interfaces. It also provides documentation for the new interfaces. [mike.kravetz@oracle.com: n_mask initialization does not need to be protected by the mutex] Link: https://lkml.kernel.org/r/0530e4ef-2492-5186-f919-5db68edea654@oracle.com Link: https://lkml.kernel.org/r/20211007181918.136982-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Michal Hocko Cc: Zi Yan Cc: Muchun Song Cc: Naoya Horiguchi Cc: David Rientjes Cc: "Aneesh Kumar K .
V" Cc: Nghia Le Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3cbf60464398..2bddd6c38204 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -586,6 +586,7 @@ struct hstate { int next_nid_to_alloc; int next_nid_to_free; unsigned int order; + unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; -- cgit v1.2.3 From 9871e2ded6c1ff61a59988d7a0e975f012105d52 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Nov 2021 13:41:23 -0700 Subject: mm/cma: add cma_pages_valid to determine if pages are in CMA Add new interface cma_pages_valid() which indicates if the specified pages are part of a CMA region. This interface will be used in a subsequent patch by hugetlb code. In order to keep the same amount of DEBUG information, a pr_debug() call was added to cma_pages_valid(). In the case where the page passed to cma_release is not in cma region, the debug message will be printed from cma_pages_valid as opposed to cma_release. Link: https://lkml.kernel.org/r/20211007181918.136982-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: David Rientjes Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nghia Le Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 53fd8c3cdbd0..bd801023504b 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -46,6 +46,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); +extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); -- cgit v1.2.3 From bd3400ea173fb611cdf2030d03620185ff6c0b0e Mon Sep 17 00:00:00 2001 From: Liangcai Fan Date: Fri, 5 Nov 2021 13:41:36 -0700 Subject: mm: khugepaged: recalculate min_free_kbytes after stopping khugepaged When initializing transparent huge pages, min_free_kbytes would be calculated according to what khugepaged expected. So when transparent huge pages get disabled, min_free_kbytes should be recalculated instead of the higher value set by khugepaged. 
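A hedged sketch of the resulting flow (the surrounding function is hypothetical; calculate_min_free_kbytes() is the helper exported by the diff below, and setup_per_zone_wmarks() is the existing watermark updater):

    static void thp_disabled_recalc_wmarks(void)    /* hypothetical caller */
    {
            /* Redo the min_free_kbytes math without khugepaged's inflated
             * recommendation, then propagate it to the per-zone watermarks. */
            calculate_min_free_kbytes();
            setup_per_zone_wmarks();
    }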
Link: https://lkml.kernel.org/r/1633937809-16558-1-git-send-email-liangcaifan19@gmail.com Signed-off-by: Liangcai Fan Signed-off-by: Chunyan Zhang Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e29deb1e0d4..7e37c726b9db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2453,6 +2453,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); +extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -- cgit v1.2.3 From 550a7d60bd5e35a56942dba6d8a26752beb26c9f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 5 Nov 2021 13:41:40 -0700 Subject: mm, hugepages: add mremap() support for hugepage backed vma Support mremap() for hugepage backed vma segment by simply repositioning page table entries. The page table entries are repositioned to the new virtual address on mremap(). Hugetlb mremap() support is of course generic; my motivating use case is a library (hugepage_text), which reloads the ELF text of executables in hugepages. This significantly increases the execution performance of said executables. Restrict the mremap operation on hugepages to up to the size of the original mapping as the underlying hugetlb reservation is not yet capable of handling remapping to a larger size. During the mremap() operation we detect pmd_share'd mappings and we unshare those during the mremap(). On access and fault the sharing is established again. 
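From userspace the newly supported operation looks like an ordinary mremap() of a hugetlb mapping; a minimal sketch (assuming a 2MB default huge page size, and keeping the size unchanged since growing the mapping is not supported):

    #define _GNU_SOURCE
    #include <sys/mman.h>

    #define LEN (2UL * 1024 * 1024)         /* one 2MB huge page */

    int move_huge_mapping(void **out)
    {
            void *new;
            void *old = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

            if (old == MAP_FAILED)
                    return -1;

            /* Reposition the mapping; the kernel now simply moves the huge
             * page table entries to the new virtual address. */
            new = mremap(old, LEN, LEN, MREMAP_MAYMOVE);
            if (new == MAP_FAILED)
                    return -1;

            *out = new;
            return 0;
    }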
Link: https://lkml.kernel.org/r/20211013195825.3058275-1-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Cc: Ken Chen Cc: Chris Kennelly Cc: Michal Hocko Cc: Vlastimil Babka Cc: Kirill Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2bddd6c38204..c0a20781b28e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, loff_t *); @@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, @@ -215,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } +static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ +} + static inline unsigned long hugetlb_total_pages(void) { return 0; @@ -262,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst, return 0; } +static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, + unsigned long len) +{ + BUG(); + return 0; +} + static inline void hugetlb_report_meminfo(struct seq_file *m) { } -- cgit v1.2.3 From 8cd7c588decf470bf7e14f2be93b709f839a965e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:25 -0700 Subject: mm/vmscan: throttle reclaim until some writeback completes if congested Patch series "Remove dependency on congestion_wait in mm/", v5. This series removes all calls to congestion_wait in mm/ and deletes wait_iff_congested. It's not a clever implementation but congestion_wait has been broken for a long time [1]. Even if congestion throttling worked, it was never a great idea. While excessive dirty/writeback pages at the tail of the LRU is one reason reclaim may be slow, there is also the problem of too many pages being isolated and reclaim failing for other reasons (elevated references, too many pages isolated, excessive LRU contention etc). This series replaces the "congestion" throttling with 3 different types. - If there are too many dirty/writeback pages, sleep until a timeout or enough pages get cleaned - If too many pages are isolated, sleep until enough isolated pages are either reclaimed or put back on the LRU - If no progress is being made, direct reclaim tasks sleep until another task makes progress with acceptable efficiency. This was initially tested with a mix of workloads that used to trigger corner cases that no longer work.
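The three sleep types map onto a single throttling primitive; a hedged sketch of one call site (reclaim_throttle() and the VMSCAN_THROTTLE_* reasons are the names visible in the ftrace output later in this message, while the condition shown is purely illustrative):

    /* In shrink_node(): if many pages at the tail of the LRU were found
     * dirty or under writeback, sleep until enough writeback completes
     * or a timeout expires, instead of calling the old
     * wait_iff_congested(). */
    if (sc->nr.immediate > nr_taken / 2)    /* illustrative threshold */
            reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);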
A new test case was created called "stutterp" (pagereclaim-stutterp-noreaders in mmtests) using a freshly created XFS filesystem. Note that it may be necessary to increase the timeout of ssh if executing remotely, as ssh itself can get throttled and the connection may time out. stutterp varies the number of "worker" processes from 4 up to NR_CPUS*4 to check the impact as the number of direct reclaimers increases. It has four types of worker. - One "anon latency" worker creates small mappings with mmap() and times how long it takes to fault the mapping, reading it 4K at a time - X file writers, which is fio randomly writing X files where the total size of the files adds up to the allowed dirty_ratio. fio is allowed to run for a warmup period to allow some file-backed pages to accumulate. The duration of the warmup is based on the best-case linear write speed of the storage. - Y file readers, which is fio randomly reading small files - Z anon memory hogs, which continually map (100-dirty_ratio)% of memory - Total estimated WSS = (100+dirty_ratio)% of memory X+Y+Z+1 == NR_WORKERS varying from 4 up to NR_CPUS*4 The intent is to maximise the total WSS with a mix of file and anon memory where some anonymous memory must be swapped and there is a high likelihood of dirty/writeback pages reaching the end of the LRU. The test can be configured to have no background readers to stress dirty/writeback pages. The results below are based on having zero readers. The short summary of the results is that the series works and stalls until some event occurs, but the timeouts may need adjustment. The test results are not broken down by patch as the series should be treated as one block that replaces a broken throttling mechanism with a working one. Finally, three machines were tested but I'm reporting the worst set of results. The other two machines had much better latencies, for example. First, the results for the "anon latency" worker:

stutterp
                                5.15.0-rc1              5.15.0-rc1
                                   vanilla  mm-reclaimcongest-v5r4
Amean     mmap-4      31.4003 (   0.00%)    2661.0198 (-8374.52%)
Amean     mmap-7      38.1641 (   0.00%)     149.2891 ( -291.18%)
Amean     mmap-12     60.0981 (   0.00%)     187.8105 ( -212.51%)
Amean     mmap-21    161.2699 (   0.00%)     213.9107 (  -32.64%)
Amean     mmap-30    174.5589 (   0.00%)     377.7548 ( -116.41%)
Amean     mmap-48   8106.8160 (   0.00%)    1070.5616 (   86.79%)
Stddev    mmap-4      41.3455 (   0.00%)   27573.9676 (-66591.66%)
Stddev    mmap-7      53.5556 (   0.00%)    4608.5860 (-8505.23%)
Stddev    mmap-12    171.3897 (   0.00%)    5559.4542 (-3143.75%)
Stddev    mmap-21   1506.6752 (   0.00%)    5746.2507 ( -281.39%)
Stddev    mmap-30    557.5806 (   0.00%)    7678.1624 (-1277.05%)
Stddev    mmap-48  61681.5718 (   0.00%)   14507.2830 (   76.48%)
Max-90    mmap-4      31.4243 (   0.00%)      83.1457 ( -164.59%)
Max-90    mmap-7      41.0410 (   0.00%)      41.0720 (   -0.08%)
Max-90    mmap-12     66.5255 (   0.00%)      53.9073 (   18.97%)
Max-90    mmap-21    146.7479 (   0.00%)     105.9540 (   27.80%)
Max-90    mmap-30    193.9513 (   0.00%)      64.3067 (   66.84%)
Max-90    mmap-48    277.9137 (   0.00%)     591.0594 ( -112.68%)
Max       mmap-4    1913.8009 (   0.00%)  299623.9695 (-15555.96%)
Max       mmap-7    2423.9665 (   0.00%)  204453.1708 (-8334.65%)
Max       mmap-12   6845.6573 (   0.00%)  221090.3366 (-3129.64%)
Max       mmap-21  56278.6508 (   0.00%)  213877.3496 ( -280.03%)
Max       mmap-30  19716.2990 (   0.00%)  216287.6229 ( -997.00%)
Max       mmap-48 477923.9400 (   0.00%)  245414.8238 (   48.65%)

For most thread counts, the time to mmap() is unfortunately increased. In earlier versions of the series, this was lower, but a large number of throttling events were reaching their timeout, increasing the amount of inefficient scanning of the LRU.
There is no prioritisation of reclaim tasks making progress based on each task's rate of page allocation versus progress of reclaim. The variance is also impacted for high worker counts, but in all cases the differences in latency are not statistically significant due to very large maximum outliers. Max-90 shows that 90% of the stalls are comparable, but the Max results show the massive outliers, which are increased due to stalling.

It is expected that this will be very machine dependent. Due to the test design, reclaim is difficult so allocations stall and there are variances depending on whether THPs can be allocated or not. The amount of memory will affect exactly how bad the corner cases are and how often they trigger. The warmup period calculation is not ideal as it's based on linear writes whereas fio is randomly writing multiple files from multiple tasks, so the start state of the test is variable. For example, these are the latencies on a single-socket machine that had more memory:

Amean   mmap-4      42.2287 (  0.00%)      49.6838 * -17.65%*
Amean   mmap-7     216.4326 (  0.00%)      47.4451 *  78.08%*
Amean   mmap-12   2412.0588 (  0.00%)      51.7497 (  97.85%)
Amean   mmap-21   5546.2548 (  0.00%)      51.8862 (  99.06%)
Amean   mmap-30   1085.3121 (  0.00%)      72.1004 (  93.36%)

The overall system CPU usage and elapsed time is as follows:

                   5.15.0-rc3  5.15.0-rc3
                      vanilla  mm-reclaimcongest-v5r4
Duration User         6989.03      983.42
Duration System       7308.12      799.68
Duration Elapsed      2277.67     2092.98

The patches reduce system CPU usage by 89% as the vanilla kernel is rarely stalling.

The high-level /proc/vmstats show:

                                 5.15.0-rc1     5.15.0-rc1
                                    vanilla     mm-reclaimcongest-v5r2
Ops Direct pages scanned      1056608451.00   503594991.00
Ops Kswapd pages scanned       109795048.00   147289810.00
Ops Kswapd pages reclaimed      63269243.00    31036005.00
Ops Direct pages reclaimed      10803973.00     6328887.00
Ops Kswapd efficiency %                57.62          21.07
Ops Kswapd velocity                 48204.98       57572.86
Ops Direct efficiency %                 1.02           1.26
Ops Direct velocity                463898.83      196845.97

Kswapd scanned fewer pages but the detailed pattern is different. The vanilla kernel scans slowly over time whereas the patches exhibit burst patterns of scan activity. Direct reclaim scanning is reduced by 52% due to stalling.

The pattern for stealing pages is also slightly different. Both kernels exhibit spikes, but the vanilla kernel when reclaiming shows pages being reclaimed over a period of time whereas the patches tend to reclaim in spikes. The difference is that vanilla is not throttling and instead scanning constantly, finding some pages over time, whereas the patched kernel throttles and reclaims in spikes.

Ops Percentage direct scans            90.59          77.37

For direct reclaim, vanilla scanned 90.59% of pages whereas with the patches, 77.37% were direct reclaim due to throttling.

Ops Page writes by reclaim        2613590.00     1687131.00

Page writes from reclaim context are reduced.

Ops Page writes anon              2932752.00     1917048.00

And there is less swapping.

Ops Page reclaim immediate      996248528.00   107664764.00

The number of pages encountered at the tail of the LRU tagged for immediate reclaim but still dirty/writeback is reduced by 89%.

Ops Slabs scanned                  164284.00      153608.00

Slab scan activity is similar.
ftrace was used to gather stall activity.

Vanilla
-------
     1 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=16000
     2 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=12000
     8 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=8000
    29 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=4000
 82394 writeback_wait_iff_congested: usec_timeout=100000 usec_delayed=0

The vast majority of wait_iff_congested calls do not stall at all. What is likely happening is that cond_resched() reschedules the task for a short period when the BDI is not registering congestion (which it never will in this test setup).

     1 writeback_congestion_wait: usec_timeout=100000 usec_delayed=120000
     2 writeback_congestion_wait: usec_timeout=100000 usec_delayed=132000
     4 writeback_congestion_wait: usec_timeout=100000 usec_delayed=112000
   380 writeback_congestion_wait: usec_timeout=100000 usec_delayed=108000
   778 writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000

congestion_wait, if called, always exceeds the timeout as there is no trigger to wake it up.

Bottom line: Vanilla will throttle but it's not effective.

Patch series
------------
Kswapd throttle activity was always due to scanning pages tagged for immediate reclaim at the tail of the LRU

     1 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK
     4 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK
     5 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK
     6 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK
    11 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK
    11 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK
    94 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK
   112 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK

The majority of events did not stall or stalled for a short period. Roughly 16% of stalls reached the full timeout.

For direct reclaim, the number of times stalled for each reason was

  6624 reason=VMSCAN_THROTTLE_ISOLATED
 93246 reason=VMSCAN_THROTTLE_NOPROGRESS
 96934 reason=VMSCAN_THROTTLE_WRITEBACK

The most common reason to stall was due to excessive pages tagged for immediate reclaim at the tail of the LRU, followed by a failure to make forward progress. A relatively small number were due to too many pages isolated from the LRU by parallel threads.

For VMSCAN_THROTTLE_ISOLATED, the breakdown of delays was

     9 usec_timeout=20000 usect_delayed=4000 reason=VMSCAN_THROTTLE_ISOLATED
    12 usec_timeout=20000 usect_delayed=16000 reason=VMSCAN_THROTTLE_ISOLATED
    83 usec_timeout=20000 usect_delayed=20000 reason=VMSCAN_THROTTLE_ISOLATED
  6520 usec_timeout=20000 usect_delayed=0 reason=VMSCAN_THROTTLE_ISOLATED

Most did not stall at all. A small number reached the timeout.
For VMSCAN_THROTTLE_NOPROGRESS, the breakdown of stalls was all over the map:

     1 usec_timeout=500000 usect_delayed=324000 reason=VMSCAN_THROTTLE_NOPROGRESS
     1 usec_timeout=500000 usect_delayed=332000 reason=VMSCAN_THROTTLE_NOPROGRESS
     1 usec_timeout=500000 usect_delayed=348000 reason=VMSCAN_THROTTLE_NOPROGRESS
     1 usec_timeout=500000 usect_delayed=360000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=228000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=260000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=340000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=364000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=372000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=428000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=460000 reason=VMSCAN_THROTTLE_NOPROGRESS
     2 usec_timeout=500000 usect_delayed=464000 reason=VMSCAN_THROTTLE_NOPROGRESS
     3 usec_timeout=500000 usect_delayed=244000 reason=VMSCAN_THROTTLE_NOPROGRESS
     3 usec_timeout=500000 usect_delayed=252000 reason=VMSCAN_THROTTLE_NOPROGRESS
     3 usec_timeout=500000 usect_delayed=272000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=188000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=268000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=328000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=380000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=392000 reason=VMSCAN_THROTTLE_NOPROGRESS
     4 usec_timeout=500000 usect_delayed=432000 reason=VMSCAN_THROTTLE_NOPROGRESS
     5 usec_timeout=500000 usect_delayed=204000 reason=VMSCAN_THROTTLE_NOPROGRESS
     5 usec_timeout=500000 usect_delayed=220000 reason=VMSCAN_THROTTLE_NOPROGRESS
     5 usec_timeout=500000 usect_delayed=412000 reason=VMSCAN_THROTTLE_NOPROGRESS
     5 usec_timeout=500000 usect_delayed=436000 reason=VMSCAN_THROTTLE_NOPROGRESS
     6 usec_timeout=500000 usect_delayed=488000 reason=VMSCAN_THROTTLE_NOPROGRESS
     7 usec_timeout=500000 usect_delayed=212000 reason=VMSCAN_THROTTLE_NOPROGRESS
     7 usec_timeout=500000 usect_delayed=300000 reason=VMSCAN_THROTTLE_NOPROGRESS
     7 usec_timeout=500000 usect_delayed=316000 reason=VMSCAN_THROTTLE_NOPROGRESS
     7 usec_timeout=500000 usect_delayed=472000 reason=VMSCAN_THROTTLE_NOPROGRESS
     8 usec_timeout=500000 usect_delayed=248000 reason=VMSCAN_THROTTLE_NOPROGRESS
     8 usec_timeout=500000 usect_delayed=356000 reason=VMSCAN_THROTTLE_NOPROGRESS
     8 usec_timeout=500000 usect_delayed=456000 reason=VMSCAN_THROTTLE_NOPROGRESS
     9 usec_timeout=500000 usect_delayed=124000 reason=VMSCAN_THROTTLE_NOPROGRESS
     9 usec_timeout=500000 usect_delayed=376000 reason=VMSCAN_THROTTLE_NOPROGRESS
     9 usec_timeout=500000 usect_delayed=484000 reason=VMSCAN_THROTTLE_NOPROGRESS
    10 usec_timeout=500000 usect_delayed=172000 reason=VMSCAN_THROTTLE_NOPROGRESS
    10 usec_timeout=500000 usect_delayed=420000 reason=VMSCAN_THROTTLE_NOPROGRESS
    10 usec_timeout=500000 usect_delayed=452000 reason=VMSCAN_THROTTLE_NOPROGRESS
    11 usec_timeout=500000 usect_delayed=256000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=112000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=116000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=144000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=152000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=264000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=384000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=424000 reason=VMSCAN_THROTTLE_NOPROGRESS
    12 usec_timeout=500000 usect_delayed=492000 reason=VMSCAN_THROTTLE_NOPROGRESS
    13 usec_timeout=500000 usect_delayed=184000 reason=VMSCAN_THROTTLE_NOPROGRESS
    13 usec_timeout=500000 usect_delayed=444000 reason=VMSCAN_THROTTLE_NOPROGRESS
    14 usec_timeout=500000 usect_delayed=308000 reason=VMSCAN_THROTTLE_NOPROGRESS
    14 usec_timeout=500000 usect_delayed=440000 reason=VMSCAN_THROTTLE_NOPROGRESS
    14 usec_timeout=500000 usect_delayed=476000 reason=VMSCAN_THROTTLE_NOPROGRESS
    16 usec_timeout=500000 usect_delayed=140000 reason=VMSCAN_THROTTLE_NOPROGRESS
    17 usec_timeout=500000 usect_delayed=232000 reason=VMSCAN_THROTTLE_NOPROGRESS
    17 usec_timeout=500000 usect_delayed=240000 reason=VMSCAN_THROTTLE_NOPROGRESS
    17 usec_timeout=500000 usect_delayed=280000 reason=VMSCAN_THROTTLE_NOPROGRESS
    18 usec_timeout=500000 usect_delayed=404000 reason=VMSCAN_THROTTLE_NOPROGRESS
    20 usec_timeout=500000 usect_delayed=148000 reason=VMSCAN_THROTTLE_NOPROGRESS
    20 usec_timeout=500000 usect_delayed=216000 reason=VMSCAN_THROTTLE_NOPROGRESS
    20 usec_timeout=500000 usect_delayed=468000 reason=VMSCAN_THROTTLE_NOPROGRESS
    21 usec_timeout=500000 usect_delayed=448000 reason=VMSCAN_THROTTLE_NOPROGRESS
    23 usec_timeout=500000 usect_delayed=168000 reason=VMSCAN_THROTTLE_NOPROGRESS
    23 usec_timeout=500000 usect_delayed=296000 reason=VMSCAN_THROTTLE_NOPROGRESS
    25 usec_timeout=500000 usect_delayed=132000 reason=VMSCAN_THROTTLE_NOPROGRESS
    25 usec_timeout=500000 usect_delayed=352000 reason=VMSCAN_THROTTLE_NOPROGRESS
    26 usec_timeout=500000 usect_delayed=180000 reason=VMSCAN_THROTTLE_NOPROGRESS
    27 usec_timeout=500000 usect_delayed=284000 reason=VMSCAN_THROTTLE_NOPROGRESS
    28 usec_timeout=500000 usect_delayed=164000 reason=VMSCAN_THROTTLE_NOPROGRESS
    29 usec_timeout=500000 usect_delayed=136000 reason=VMSCAN_THROTTLE_NOPROGRESS
    30 usec_timeout=500000 usect_delayed=200000 reason=VMSCAN_THROTTLE_NOPROGRESS
    30 usec_timeout=500000 usect_delayed=400000 reason=VMSCAN_THROTTLE_NOPROGRESS
    31 usec_timeout=500000 usect_delayed=196000 reason=VMSCAN_THROTTLE_NOPROGRESS
    32 usec_timeout=500000 usect_delayed=156000 reason=VMSCAN_THROTTLE_NOPROGRESS
    33 usec_timeout=500000 usect_delayed=224000 reason=VMSCAN_THROTTLE_NOPROGRESS
    35 usec_timeout=500000 usect_delayed=128000 reason=VMSCAN_THROTTLE_NOPROGRESS
    35 usec_timeout=500000 usect_delayed=176000 reason=VMSCAN_THROTTLE_NOPROGRESS
    36 usec_timeout=500000 usect_delayed=368000 reason=VMSCAN_THROTTLE_NOPROGRESS
    36 usec_timeout=500000 usect_delayed=496000 reason=VMSCAN_THROTTLE_NOPROGRESS
    37 usec_timeout=500000 usect_delayed=312000 reason=VMSCAN_THROTTLE_NOPROGRESS
    38 usec_timeout=500000 usect_delayed=304000 reason=VMSCAN_THROTTLE_NOPROGRESS
    40 usec_timeout=500000 usect_delayed=288000 reason=VMSCAN_THROTTLE_NOPROGRESS
    43 usec_timeout=500000 usect_delayed=408000 reason=VMSCAN_THROTTLE_NOPROGRESS
    55 usec_timeout=500000 usect_delayed=416000 reason=VMSCAN_THROTTLE_NOPROGRESS
    56 usec_timeout=500000 usect_delayed=76000 reason=VMSCAN_THROTTLE_NOPROGRESS
    58 usec_timeout=500000 usect_delayed=120000 reason=VMSCAN_THROTTLE_NOPROGRESS
    59 usec_timeout=500000 usect_delayed=208000 reason=VMSCAN_THROTTLE_NOPROGRESS
    61 usec_timeout=500000 usect_delayed=68000 reason=VMSCAN_THROTTLE_NOPROGRESS
    71 usec_timeout=500000 usect_delayed=192000 reason=VMSCAN_THROTTLE_NOPROGRESS
    71 usec_timeout=500000 usect_delayed=480000 reason=VMSCAN_THROTTLE_NOPROGRESS
    79 usec_timeout=500000 usect_delayed=60000 reason=VMSCAN_THROTTLE_NOPROGRESS
    82 usec_timeout=500000 usect_delayed=320000 reason=VMSCAN_THROTTLE_NOPROGRESS
    82 usec_timeout=500000 usect_delayed=92000 reason=VMSCAN_THROTTLE_NOPROGRESS
    85 usec_timeout=500000 usect_delayed=64000 reason=VMSCAN_THROTTLE_NOPROGRESS
    85 usec_timeout=500000 usect_delayed=80000 reason=VMSCAN_THROTTLE_NOPROGRESS
    88 usec_timeout=500000 usect_delayed=84000 reason=VMSCAN_THROTTLE_NOPROGRESS
    90 usec_timeout=500000 usect_delayed=160000 reason=VMSCAN_THROTTLE_NOPROGRESS
    90 usec_timeout=500000 usect_delayed=292000 reason=VMSCAN_THROTTLE_NOPROGRESS
    94 usec_timeout=500000 usect_delayed=56000 reason=VMSCAN_THROTTLE_NOPROGRESS
   118 usec_timeout=500000 usect_delayed=88000 reason=VMSCAN_THROTTLE_NOPROGRESS
   119 usec_timeout=500000 usect_delayed=72000 reason=VMSCAN_THROTTLE_NOPROGRESS
   126 usec_timeout=500000 usect_delayed=108000 reason=VMSCAN_THROTTLE_NOPROGRESS
   146 usec_timeout=500000 usect_delayed=52000 reason=VMSCAN_THROTTLE_NOPROGRESS
   148 usec_timeout=500000 usect_delayed=36000 reason=VMSCAN_THROTTLE_NOPROGRESS
   148 usec_timeout=500000 usect_delayed=48000 reason=VMSCAN_THROTTLE_NOPROGRESS
   159 usec_timeout=500000 usect_delayed=28000 reason=VMSCAN_THROTTLE_NOPROGRESS
   178 usec_timeout=500000 usect_delayed=44000 reason=VMSCAN_THROTTLE_NOPROGRESS
   183 usec_timeout=500000 usect_delayed=40000 reason=VMSCAN_THROTTLE_NOPROGRESS
   237 usec_timeout=500000 usect_delayed=100000 reason=VMSCAN_THROTTLE_NOPROGRESS
   266 usec_timeout=500000 usect_delayed=32000 reason=VMSCAN_THROTTLE_NOPROGRESS
   313 usec_timeout=500000 usect_delayed=24000 reason=VMSCAN_THROTTLE_NOPROGRESS
   347 usec_timeout=500000 usect_delayed=96000 reason=VMSCAN_THROTTLE_NOPROGRESS
   470 usec_timeout=500000 usect_delayed=20000 reason=VMSCAN_THROTTLE_NOPROGRESS
   559 usec_timeout=500000 usect_delayed=16000 reason=VMSCAN_THROTTLE_NOPROGRESS
   964 usec_timeout=500000 usect_delayed=12000 reason=VMSCAN_THROTTLE_NOPROGRESS
  2001 usec_timeout=500000 usect_delayed=104000 reason=VMSCAN_THROTTLE_NOPROGRESS
  2447 usec_timeout=500000 usect_delayed=8000 reason=VMSCAN_THROTTLE_NOPROGRESS
  7888 usec_timeout=500000 usect_delayed=4000 reason=VMSCAN_THROTTLE_NOPROGRESS
 22727 usec_timeout=500000 usect_delayed=0 reason=VMSCAN_THROTTLE_NOPROGRESS
 51305 usec_timeout=500000 usect_delayed=500000 reason=VMSCAN_THROTTLE_NOPROGRESS

The full timeout is often hit, but a large number also do not stall at all. The remainder slept a little, allowing other reclaim tasks to make progress. While this timeout could be further increased, it could also negatively impact worst-case behaviour when there is no prioritisation of what task should make progress.
For VMSCAN_THROTTLE_WRITEBACK, the breakdown was

     1 usec_timeout=100000 usect_delayed=44000 reason=VMSCAN_THROTTLE_WRITEBACK
     2 usec_timeout=100000 usect_delayed=76000 reason=VMSCAN_THROTTLE_WRITEBACK
     3 usec_timeout=100000 usect_delayed=80000 reason=VMSCAN_THROTTLE_WRITEBACK
     5 usec_timeout=100000 usect_delayed=48000 reason=VMSCAN_THROTTLE_WRITEBACK
     5 usec_timeout=100000 usect_delayed=84000 reason=VMSCAN_THROTTLE_WRITEBACK
     6 usec_timeout=100000 usect_delayed=72000 reason=VMSCAN_THROTTLE_WRITEBACK
     7 usec_timeout=100000 usect_delayed=88000 reason=VMSCAN_THROTTLE_WRITEBACK
    11 usec_timeout=100000 usect_delayed=56000 reason=VMSCAN_THROTTLE_WRITEBACK
    12 usec_timeout=100000 usect_delayed=64000 reason=VMSCAN_THROTTLE_WRITEBACK
    16 usec_timeout=100000 usect_delayed=92000 reason=VMSCAN_THROTTLE_WRITEBACK
    24 usec_timeout=100000 usect_delayed=68000 reason=VMSCAN_THROTTLE_WRITEBACK
    28 usec_timeout=100000 usect_delayed=32000 reason=VMSCAN_THROTTLE_WRITEBACK
    30 usec_timeout=100000 usect_delayed=60000 reason=VMSCAN_THROTTLE_WRITEBACK
    30 usec_timeout=100000 usect_delayed=96000 reason=VMSCAN_THROTTLE_WRITEBACK
    32 usec_timeout=100000 usect_delayed=52000 reason=VMSCAN_THROTTLE_WRITEBACK
    42 usec_timeout=100000 usect_delayed=40000 reason=VMSCAN_THROTTLE_WRITEBACK
    77 usec_timeout=100000 usect_delayed=28000 reason=VMSCAN_THROTTLE_WRITEBACK
    99 usec_timeout=100000 usect_delayed=36000 reason=VMSCAN_THROTTLE_WRITEBACK
   137 usec_timeout=100000 usect_delayed=24000 reason=VMSCAN_THROTTLE_WRITEBACK
   190 usec_timeout=100000 usect_delayed=20000 reason=VMSCAN_THROTTLE_WRITEBACK
   339 usec_timeout=100000 usect_delayed=16000 reason=VMSCAN_THROTTLE_WRITEBACK
   518 usec_timeout=100000 usect_delayed=12000 reason=VMSCAN_THROTTLE_WRITEBACK
   852 usec_timeout=100000 usect_delayed=8000 reason=VMSCAN_THROTTLE_WRITEBACK
  3359 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK
  7147 usec_timeout=100000 usect_delayed=0 reason=VMSCAN_THROTTLE_WRITEBACK
 83962 usec_timeout=100000 usect_delayed=100000 reason=VMSCAN_THROTTLE_WRITEBACK

The majority hit the timeout in direct reclaim context, although a sizable number did not stall at all. This is very different to kswapd, where only a tiny percentage of stalls due to writeback reached the timeout.

Bottom line, the throttling appears to work and the wakeup events may limit worst case stalls. There might be some grounds for adjusting timeouts, but it's likely futile as the worst-case scenarios depend on the workload, memory size and the speed of the storage. A better approach to improve the series further would be to prioritise tasks based on their rate of allocation, with the caveat that it may be very expensive to track.

This patch (of 5):

Page reclaim throttles on wait_iff_congested under the following conditions:

- kswapd is encountering pages under writeback and marked for immediate reclaim, implying that pages are cycling through the LRU faster than pages can be cleaned.
- Direct reclaim will stall if all dirty pages are backed by congested inodes.

wait_iff_congested is almost completely broken with few exceptions. This patch adds a new node-based workqueue and tracks the number of throttled tasks and pages written back since throttling started. If enough pages belonging to the node are written back then the throttled tasks will wake early. If not, the throttled tasks sleep until the timeout expires.
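As an illustration of the wake-early logic described above (a sketch of the intent using the fields added in the diff below; the SWAP_CLUSTER_MAX-per-task threshold is an assumption for the example, not a documented constant):

	static void acct_reclaim_writeback_sketch(pg_data_t *pgdat)
	{
		int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
		unsigned long nr_written;

		if (!nr_throttled)
			return;

		/* Pages written back since throttling started. */
		nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
			     READ_ONCE(pgdat->nr_reclaim_start);

		if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
			wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
	}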
[neilb@suse.de: Uninterruptible sleep and simpler wakeups]
[hdanton@sina.com: Avoid race when reclaim starts]
[vbabka@suse.cz: vmstat irq-safe api, clarifications]
Link: https://lore.kernel.org/linux-mm/45d8b7a6-8548-65f5-cccf-9f451d4ae3d4@kernel.dk/ [1]
Link: https://lkml.kernel.org/r/20211022144651.19914-1-mgorman@techsingularity.net
Link: https://lkml.kernel.org/r/20211022144651.19914-2-mgorman@techsingularity.net
Signed-off-by: Mel Gorman
Acked-by: Vlastimil Babka
Cc: NeilBrown
Cc: "Theodore Ts'o"
Cc: Andreas Dilger
Cc: "Darrick J. Wong"
Cc: Matthew Wilcox
Cc: Michal Hocko
Cc: Dave Chinner
Cc: Rik van Riel
Cc: Johannes Weiner
Cc: Jonathan Corbet
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/backing-dev.h |  1 -
 include/linux/mmzone.h      | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index ac7f231b8825..9fb1f0ae273c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -154,7 +154,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(int sync, long timeout);
 
 static inline bool mapping_can_writeback(struct address_space *mapping)
 {

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fb36a29e3aae..5a5e19b90dab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -199,6 +199,7 @@ enum node_stat_item {
 	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
+	NR_THROTTLED_WRITTEN,	/* NR_WRITTEN while reclaim throttled */
 	NR_KERNEL_MISC_RECLAIMABLE,	/* reclaimable non-slab kernel pages */
 	NR_FOLL_PIN_ACQUIRED,	/* via: pin_user_page(), gup flag: FOLL_PIN */
 	NR_FOLL_PIN_RELEASED,	/* pages returned via unpin_user_page() */
@@ -272,6 +273,11 @@ enum lru_list {
 	NR_LRU_LISTS
 };
 
+enum vmscan_throttle_state {
+	VMSCAN_THROTTLE_WRITEBACK,
+	NR_VMSCAN_THROTTLE,
+};
+
 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
 
 #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
@@ -841,6 +847,13 @@ typedef struct pglist_data {
 	int node_id;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
+
+	/* workqueues for throttling reclaim for different reasons. */
+	wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];
+
+	atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
+	unsigned long nr_reclaim_start;	/* nr pages written while throttled
+					 * when throttling started. */
 	struct task_struct *kswapd;	/* Protected by mem_hotplug_begin/end() */
 	int kswapd_order;

--
cgit v1.2.3

From d818fca1cac31b1fc9301bda83e195a46fb4ebaa Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Fri, 5 Nov 2021 13:42:29 -0700
Subject: mm/vmscan: throttle reclaim and compaction when too many pages are isolated

Page reclaim throttles on congestion if too many parallel reclaim instances have isolated too many pages. This makes no sense; excessive parallelisation has nothing to do with writeback or congestion.

This patch creates an additional workqueue to sleep on when too many pages are isolated. The throttled tasks are woken when the number of isolated pages is reduced or a timeout occurs. There may be some false positive wakeups for GFP_NOIO/GFP_NOFS callers but the tasks will throttle again if necessary.
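The wake side of this could look roughly like the following (the function name and call sites are assumptions for illustration; per the fixlet folded in below, compaction also performs this wakeup):

	static void wake_throttle_isolated_sketch(pg_data_t *pgdat)
	{
		wait_queue_head_t *wqh;

		/* Called once isolated page counts drop, e.g. when isolated
		 * pages are put back on the LRU. */
		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
		if (waitqueue_active(wqh))
			wake_up(wqh);
	}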
[shy828301@gmail.com: Wake up from compaction context] [vbabka@suse.cz: Account number of throttled tasks only for writeback] Link: https://lkml.kernel.org/r/20211022144651.19914-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a5e19b90dab..312c1ea9aafa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -275,6 +275,7 @@ enum lru_list { enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, + VMSCAN_THROTTLE_ISOLATED, NR_VMSCAN_THROTTLE, }; -- cgit v1.2.3 From 69392a403f49e6e33f9dfb1d6edb87c8006f83c2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Nov 2021 13:42:32 -0700 Subject: mm/vmscan: throttle reclaim when no progress is being made Memcg reclaim throttles on congestion if no reclaim progress is made. This makes little sense, it might be due to writeback or a host of other factors. For !memcg reclaim, it's messy. Direct reclaim primarily is throttled in the page allocator if it is failing to make progress. Kswapd throttles if too many pages are under writeback and marked for immediate reclaim. This patch explicitly throttles if reclaim is failing to make progress. [vbabka@suse.cz: Remove redundant code] Link: https://lkml.kernel.org/r/20211022144651.19914-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andreas Dilger Cc: "Darrick J . Wong" Cc: Dave Chinner Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: NeilBrown Cc: Rik van Riel Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 312c1ea9aafa..58e744b78c2c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -276,6 +276,7 @@ enum lru_list { enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, + VMSCAN_THROTTLE_NOPROGRESS, NR_VMSCAN_THROTTLE, }; -- cgit v1.2.3 From 7e6ec49c18988f1b8dab0677271dafde5f8d9a43 Mon Sep 17 00:00:00 2001 From: Yuanzheng Song Date: Fri, 5 Nov 2021 13:42:52 -0700 Subject: mm/vmpressure: fix data-race with memcg->socket_pressure When reading memcg->socket_pressure in mem_cgroup_under_socket_pressure() and writing memcg->socket_pressure in vmpressure() at the same time, the following data-race occurs: BUG: KCSAN: data-race in __sk_mem_reduce_allocated / vmpressure write to 0xffff8881286f4938 of 8 bytes by task 24550 on cpu 3: vmpressure+0x218/0x230 mm/vmpressure.c:307 shrink_node_memcgs+0x2b9/0x410 mm/vmscan.c:2658 shrink_node+0x9d2/0x11d0 mm/vmscan.c:2769 shrink_zones+0x29f/0x470 mm/vmscan.c:2972 do_try_to_free_pages+0x193/0x6e0 mm/vmscan.c:3027 try_to_free_mem_cgroup_pages+0x1c0/0x3f0 mm/vmscan.c:3345 reclaim_high mm/memcontrol.c:2440 [inline] mem_cgroup_handle_over_high+0x18b/0x4d0 mm/memcontrol.c:2624 tracehook_notify_resume include/linux/tracehook.h:197 [inline] exit_to_user_mode_loop kernel/entry/common.c:164 [inline] exit_to_user_mode_prepare+0x110/0x170 kernel/entry/common.c:191 syscall_exit_to_user_mode+0x16/0x30 kernel/entry/common.c:266 
ret_from_fork+0x15/0x30 arch/x86/entry/entry_64.S:289 read to 0xffff8881286f4938 of 8 bytes by interrupt on cpu 1: mem_cgroup_under_socket_pressure include/linux/memcontrol.h:1483 [inline] sk_under_memory_pressure include/net/sock.h:1314 [inline] __sk_mem_reduce_allocated+0x1d2/0x270 net/core/sock.c:2696 __sk_mem_reclaim+0x44/0x50 net/core/sock.c:2711 sk_mem_reclaim include/net/sock.h:1490 [inline] ...... net_rx_action+0x17a/0x480 net/core/dev.c:6864 __do_softirq+0x12c/0x2af kernel/softirq.c:298 run_ksoftirqd+0x13/0x20 kernel/softirq.c:653 smpboot_thread_fn+0x33f/0x510 kernel/smpboot.c:165 kthread+0x1fc/0x220 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 Fix it by using READ_ONCE() and WRITE_ONCE() to read and write memcg->socket_pressure. Link: https://lkml.kernel.org/r/20211025082843.671690-1-songyuanzheng@huawei.com Signed-off-by: Yuanzheng Song Reviewed-by: Muchun Song Cc: Shakeel Butt Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f207f98bdb76..9d96238d9c21 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1606,7 +1606,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { - if (time_before(jiffies, memcg->socket_pressure)) + if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; -- cgit v1.2.3 From fa27717110ae51b9b9013ced0b5143888257bb79 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:13 -0700 Subject: memblock: drop memblock_free_early_nid() and memblock_free_early() memblock_free_early_nid() is unused and memblock_free_early() is an alias for memblock_free(). Replace calls to memblock_free_early() with calls to memblock_free() and remove memblock_free_early() and memblock_free_early_nid(). Link: https://lkml.kernel.org/r/20210930185031.18648-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Christophe Leroy Cc: Juergen Gross Cc: Shahab Vahedi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 34de69b3b8ba..fc8183be340c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -441,18 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_early(phys_addr_t base, - phys_addr_t size) -{ - memblock_free(base, size); -} - -static inline void memblock_free_early_nid(phys_addr_t base, - phys_addr_t size, int nid) -{ - memblock_free(base, size); -} - static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) { __memblock_free_late(base, size); -- cgit v1.2.3 From 621d973901cf9fa6c6e31b31bdd36c5c5f3c9c9e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 5 Nov 2021 13:43:16 -0700 Subject: memblock: stop aliasing __memblock_free_late with memblock_free_late memblock_free_late() is a NOP wrapper for __memblock_free_late(), there is no point to keep this indirection. 
Drop the wrapper and rename __memblock_free_late() to memblock_free_late().

Link: https://lkml.kernel.org/r/20210930185031.18648-5-rppt@kernel.org
Signed-off-by: Mike Rapoport
Cc: Christophe Leroy
Cc: Juergen Gross
Cc: Shahab Vahedi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memblock.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index fc8183be340c..e25f964fdd60 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -133,7 +133,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
-void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+void memblock_free_late(phys_addr_t base, phys_addr_t size);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
@@ -441,11 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size,
 				      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
-{
-	__memblock_free_late(base, size);
-}
-
 /*
  * Set the allocation direction to bottom-up or top-down.
  */

--
cgit v1.2.3

From 3ecc68349bbab6bff1d12cbc7951ca6019b2faf6 Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Fri, 5 Nov 2021 13:43:19 -0700
Subject: memblock: rename memblock_free to memblock_phys_free

Since memblock_free() operates on a physical range, make its name reflect it and rename it to memblock_phys_free(), so it will be a logical counterpart to memblock_phys_alloc().

The callers are updated with the below semantic patch:

@@
expression addr;
expression size;
@@
- memblock_free(addr, size);
+ memblock_phys_free(addr, size);

Link: https://lkml.kernel.org/r/20210930185031.18648-6-rppt@kernel.org
Signed-off-by: Mike Rapoport
Cc: Christophe Leroy
Cc: Juergen Gross
Cc: Shahab Vahedi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e25f964fdd60..d32d41709513 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -103,7 +103,7 @@ void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
-int memblock_free(phys_addr_t base, phys_addr_t size);
+int memblock_phys_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 int memblock_physmem_add(phys_addr_t base, phys_addr_t size);

--
cgit v1.2.3

From 4421cca0a3e4833b3bf0f20de98eb580ab8c7290 Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Fri, 5 Nov 2021 13:43:22 -0700
Subject: memblock: use memblock_free for freeing virtual pointers

Rename memblock_free_ptr() to memblock_free() and use memblock_free() when freeing a virtual pointer, so that memblock_free() will be a counterpart of memblock_alloc(). The callers are updated with the below semantic patch and manual addition of (void *) casting to pointers that are represented by unsigned long variables.
@@
identifier vaddr;
expression size;
@@
(
- memblock_phys_free(__pa(vaddr), size);
+ memblock_free(vaddr, size);
|
- memblock_free_ptr(vaddr, size);
+ memblock_free(vaddr, size);
)

[sfr@canb.auug.org.au: fixup]
Link: https://lkml.kernel.org/r/20211018192940.3d1d532f@canb.auug.org.au
Link: https://lkml.kernel.org/r/20210930185031.18648-7-rppt@kernel.org
Signed-off-by: Mike Rapoport
Signed-off-by: Stephen Rothwell
Cc: Christophe Leroy
Cc: Juergen Gross
Cc: Shahab Vahedi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d32d41709513..484650681bee 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -118,7 +118,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
-void memblock_free_ptr(void *ptr, size_t size);
+void memblock_free(void *ptr, size_t size);
 void reset_node_managed_pages(pg_data_t *pgdat);
 void reset_all_zones_managed_pages(void);

--
cgit v1.2.3

From b5389086ad7be0453c55e0069a89856d1fbdf605 Mon Sep 17 00:00:00 2001
From: Zhenguo Yao
Date: Fri, 5 Nov 2021 13:43:28 -0700
Subject: hugetlbfs: extend the definition of hugepages parameter to support node allocation

We can specify the number of hugepages to allocate at boot, but the hugepages are balanced across all nodes at present. In some scenarios, we only need hugepages in one node. For example: DPDK needs hugepages which are in the same node as the NIC. If DPDK needs four hugepages of 1G size in node1 and the system has 16 NUMA nodes, we must reserve 64 hugepages on the kernel cmdline. But only four hugepages are used; the others should be freed after boot. If the system memory is low (for example, 64G), it will be an impossible task.

So extend the hugepages parameter to support specifying hugepages on a specific node. For example, add the following parameter:

hugepagesz=1G hugepages=0:1,1:3

It will allocate 1 hugepage in node0 and 3 hugepages in node1.
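A sketch of how the node:count list might be parsed (hypothetical, simplified parser for illustration; the real code in mm/hugetlb.c also keeps the old plain-count form working and validates input more carefully):

	static int __init hugepages_parse_per_node_sketch(char *s,
							  struct hstate *h)
	{
		unsigned int nid, count;

		while (*s) {
			if (sscanf(s, "%u:%u", &nid, &count) != 2 ||
			    nid >= MAX_NUMNODES)
				return -EINVAL;
			h->max_huge_pages_node[nid] = count;
			h->max_huge_pages += count;	/* keep the global total */
			s = strchr(s, ',');
			if (!s)
				break;
			s++;		/* step over the separating comma */
		}
		return 0;
	}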
Link: https://lkml.kernel.org/r/20211005054729.86457-1-yaozhenguo1@gmail.com
Signed-off-by: Zhenguo Yao
Reviewed-by: Mike Kravetz
Cc: Zhenguo Yao
Cc: Dan Carpenter
Cc: Nathan Chancellor
Cc: Michael Ellerman
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Jonathan Corbet
Cc: Mike Rapoport
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c0a20781b28e..44c2ab0dfa59 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -615,6 +615,7 @@ struct hstate {
 	unsigned long nr_overcommit_huge_pages;
 	struct list_head hugepage_activelist;
 	struct list_head hugepage_freelists[MAX_NUMNODES];
+	unsigned int max_huge_pages_node[MAX_NUMNODES];
 	unsigned int nr_huge_pages_node[MAX_NUMNODES];
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
@@ -647,8 +648,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address, struct page *page);
 
 /* arch callback */
-int __init __alloc_bootmem_huge_page(struct hstate *h);
-int __init alloc_bootmem_huge_page(struct hstate *h);
+int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
+bool __init hugetlb_node_alloc_supported(void);
 
 void __init hugetlb_add_hstate(unsigned order);
 bool __init arch_hugetlb_valid_size(unsigned long size);

--
cgit v1.2.3

From 8eb42beac8d369f5443addd469879d152422a28d Mon Sep 17 00:00:00 2001
From: John Hubbard
Date: Fri, 5 Nov 2021 13:43:32 -0700
Subject: mm/migrate: de-duplicate migrate_reason strings

In order to remove the need to manually keep three different files in sync, provide a common definition of the mapping between enum migrate_reason and the associated strings for each enum item.

1. Use the tracing system's mapping of enums to strings, by redefining and reusing the MIGRATE_REASON and supporting macros, and using that to populate the string array in mm/debug.c.

2. Move enum migrate_reason to migrate_mode.h. This is not strictly necessary for this patch, but migrate mode and migrate reason go together, so this will slightly clarify things.
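The resulting pattern in mm/debug.c looks roughly like this (a sketch of the approach; it reuses the EM()/EMe() helper macros that the tracing headers expand into __print_symbolic() tables, so the enum/string list is written once in include/trace/events/migrate.h):

	#undef EM
	#undef EMe
	#define EM(a, b)	b,	/* each list entry becomes its string */
	#define EMe(a, b)	b	/* last entry: no trailing comma */

	const char *migrate_reason_names[MR_TYPES] = {
		MIGRATE_REASON		/* the single authoritative list */
	};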
Link: https://lkml.kernel.org/r/20210922041755.141817-2-jhubbard@nvidia.com
Signed-off-by: John Hubbard
Reviewed-by: Weizhao Ouyang
Cc: "Huang, Ying"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/migrate.h      | 19 +------------------
 include/linux/migrate_mode.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 18 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e936691..3d154fe03c96 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -19,24 +19,7 @@ struct migration_target_control;
  */
 #define MIGRATEPAGE_SUCCESS		0
 
-/*
- * Keep sync with:
- * - macro MIGRATE_REASON in include/trace/events/migrate.h
- * - migrate_reason_names[MR_TYPES] in mm/debug.c
- */
-enum migrate_reason {
-	MR_COMPACTION,
-	MR_MEMORY_FAILURE,
-	MR_MEMORY_HOTPLUG,
-	MR_SYSCALL,		/* also applies to cpusets */
-	MR_MEMPOLICY_MBIND,
-	MR_NUMA_MISPLACED,
-	MR_CONTIG_RANGE,
-	MR_LONGTERM_PIN,
-	MR_DEMOTION,
-	MR_TYPES
-};
-
+/* Defined in mm/debug.c: */
 extern const char *migrate_reason_names[MR_TYPES];
 
 #ifdef CONFIG_MIGRATION

diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 883c99249033..f37cc03f9369 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -19,4 +19,17 @@ enum migrate_mode {
 	MIGRATE_SYNC_NO_COPY,
 };
 
+enum migrate_reason {
+	MR_COMPACTION,
+	MR_MEMORY_FAILURE,
+	MR_MEMORY_HOTPLUG,
+	MR_SYSCALL,		/* also applies to cpusets */
+	MR_MEMPOLICY_MBIND,
+	MR_NUMA_MISPLACED,
+	MR_CONTIG_RANGE,
+	MR_LONGTERM_PIN,
+	MR_DEMOTION,
+	MR_TYPES
+};
+
 #endif		/* MIGRATE_MODE_H_INCLUDED */

--
cgit v1.2.3

From 20f9ba4f995247bb79e243741b8fdddbd76dd923 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Fri, 5 Nov 2021 13:43:35 -0700
Subject: mm: migrate: make demotion knob depend on migration

Memory demotion needs to call migrate_pages() to do the job. It is controlled by a knob; however, the knob doesn't depend on CONFIG_MIGRATION. The knob could be turned on even though MIGRATION is disabled. This will not cause any crash since migrate_pages() would just return -ENOSYS, but it is definitely not optimal to go through the demotion path and then retry regular swap every time. And it doesn't make too much sense to have the knob visible to users when !MIGRATION. Move the related code from mempolicy.[h|c] to migrate.[h|c].
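With the knob behind CONFIG_MIGRATION, a reclaim-side check can compile away entirely when migration is disabled; a sketch of the idea (assumed shape, not the exact vmscan code):

	static bool can_demote_sketch(int nid)
	{
		/* Without CONFIG_MIGRATION, numa_demotion_enabled is the
		 * compile-time constant false, so this is dead code rather
		 * than a demotion attempt that fails with -ENOSYS. */
		if (!numa_demotion_enabled)
			return false;
		return next_demotion_node(nid) != NUMA_NO_NODE;
	}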
Link: https://lkml.kernel.org/r/20211015005559.246709-1-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: "Huang, Ying" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ---- include/linux/migrate.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 097bbbb37493..3c7595e81150 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -183,8 +183,6 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); -extern bool numa_demotion_enabled; - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); @@ -300,8 +298,6 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) return NULL; } -#define numa_demotion_enabled false - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3d154fe03c96..2d8130e05dc0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); + +extern bool numa_demotion_enabled; #else static inline void putback_movable_pages(struct list_head *l) {} @@ -65,6 +67,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } + +#define numa_demotion_enabled false #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION -- cgit v1.2.3 From 50f9481ed9fb8a2d2a06a155634c7f9eeff9fa61 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:24 -0700 Subject: mm/memory_hotplug: remove CONFIG_MEMORY_HOTPLUG_SPARSE CONFIG_MEMORY_HOTPLUG depends on CONFIG_SPARSEMEM, so there is no need for CONFIG_MEMORY_HOTPLUG_SPARSE anymore; adjust all instances to use CONFIG_MEMORY_HOTPLUG and remove CONFIG_MEMORY_HOTPLUG_SPARSE. Link: https://lkml.kernel.org/r/20210929143600.49379-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Shuah Khan [kselftest] Acked-by: Greg Kroah-Hartman Acked-by: Oscar Salvador Cc: Alex Shi Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Jonathan Corbet Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 24 ++++++++++-------------- include/linux/node.h | 4 ++-- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 053a530c7bdd..0328ec039c38 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -110,7 +110,7 @@ struct mem_section; #define SLAB_CALLBACK_PRI 1 #define IPC_CALLBACK_PRI 10 -#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) { return; @@ -126,7 +126,14 @@ static inline int memory_notify(unsigned long val, void *v) { return 0; } -#else +static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) +{ + return 0; +} +/* These aren't inline functions due to a GCC bug. */ +#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) +#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) +#else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, @@ -148,9 +155,6 @@ struct memory_group *memory_group_find_by_id(int mgid); typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *); int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, struct memory_group *excluded, void *arg); -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ - -#ifdef CONFIG_MEMORY_HOTPLUG #define hotplug_memory_notifier(fn, pri) ({ \ static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri };\ @@ -158,15 +162,7 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, }) #define register_hotmemory_notifier(nb) register_memory_notifier(nb) #define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) -#else -static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) -{ - return 0; -} -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) -#endif +#endif /* CONFIG_MEMORY_HOTPLUG */ /* * Kernel text modification mutex, used for code patching. Users of this lock diff --git a/include/linux/node.h b/include/linux/node.h index 8e5a29897936..bb21fd631b16 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,7 +85,7 @@ struct node { struct device dev; struct list_head access_list; -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) struct work_struct node_work; #endif #ifdef CONFIG_HMEM_REPORTING @@ -98,7 +98,7 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, enum meminit_context context); -- cgit v1.2.3 From 6b740c6c3aa371cd70ac07f8d071f2a8af28c51c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:31 -0700 Subject: mm/memory_hotplug: remove HIGHMEM leftovers We don't support CONFIG_MEMORY_HOTPLUG on 32 bit and consequently not HIGHMEM. 
Let's remove any leftover code -- including the unused "status_change_nid_high" field that was part of the memory notifier.

Link: https://lkml.kernel.org/r/20210929143600.49379-5-david@redhat.com
Signed-off-by: David Hildenbrand
Reviewed-by: Oscar Salvador
Cc: Alex Shi
Cc: Andy Lutomirski
Cc: Benjamin Herrenschmidt
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: Greg Kroah-Hartman
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Jason Wang
Cc: Jonathan Corbet
Cc: Michael Ellerman
Cc: "Michael S. Tsirkin"
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: "Rafael J. Wysocki"
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory.h | 1 -
 1 file changed, 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0328ec039c38..88eb587b5143 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,7 +96,6 @@ struct memory_notify {
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
-	int status_change_nid_high;
 	int status_change_nid;
 };

--
cgit v1.2.3

From 43e3aa2a3247c9ca348884866a9846d788ec5ed6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 5 Nov 2021 13:44:35 -0700
Subject: mm/memory_hotplug: remove stale function declarations

These functions no longer exist.

Link: https://lkml.kernel.org/r/20210929143600.49379-6-david@redhat.com
Signed-off-by: David Hildenbrand
Reviewed-by: Oscar Salvador
Cc: Alex Shi
Cc: Andy Lutomirski
Cc: Benjamin Herrenschmidt
Cc: Borislav Petkov
Cc: Dave Hansen
Cc: Greg Kroah-Hartman
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Jason Wang
Cc: Jonathan Corbet
Cc: Michael Ellerman
Cc: "Michael S. Tsirkin"
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Paul Mackerras
Cc: Peter Zijlstra
Cc: "Rafael J. Wysocki"
Cc: Shuah Khan
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory_hotplug.h | 3 ---
 1 file changed, 3 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e5a867c950b2..be48e003a518 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,9 +98,6 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
 	seqlock_init(&zone->span_seqlock);
 }
-extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
-extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
-extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 extern void adjust_present_page_count(struct page *page,
 				      struct memory_group *group,
 				      long nr_pages);

--
cgit v1.2.3

From e14b41556d9ec20af60a9f672ac6f64dd450763c Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 5 Nov 2021 13:44:46 -0700
Subject: memblock: improve MEMBLOCK_HOTPLUG documentation

The description of MEMBLOCK_HOTPLUG is currently short and consequently misleading: we're actually dealing with a memory region that might get hotunplugged later (i.e., the platform+firmware supports it), yet it is indicated in the firmware-provided memory map as system RAM that will just get used by the system for any purpose when not taking special care. The firmware marked this memory region as a hot(un)plugged memory region (e.g., hotplugged before reboot), implying that it might get hotunplugged again later.
Whether we consider this information depends on the "movable_node" kernel commandline parameter: only with "movable_node" set do we try to keep this memory hotunpluggable, for example, by not serving early allocations from this memory region and by letting the buddy manage it using ZONE_MOVABLE.

Let's make this clearer by extending the documentation.

Note: kexec *has to* indicate this memory to the second kernel. With "movable_node" set, we don't want to place kexec-images on this memory. Without "movable_node" set, we don't care and can place kexec-images on this memory. In both cases, after successful memory hotunplug, kexec has to be re-armed to update the memory map for the second kernel and to place the kexec-images somewhere else.

Link: https://lkml.kernel.org/r/20211004093605.5830-3-david@redhat.com
Signed-off-by: David Hildenbrand
Reviewed-by: Mike Rapoport
Cc: "Aneesh Kumar K.V"
Cc: Arnd Bergmann
Cc: Christian Borntraeger
Cc: Eric Biederman
Cc: Geert Uytterhoeven
Cc: Heiko Carstens
Cc: Huacai Chen
Cc: Jianyong Wu
Cc: Jiaxun Yang
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Shahab Vahedi
Cc: Thomas Bogendoerfer
Cc: Vasily Gorbik
Cc: Vineet Gupta
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memblock.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 484650681bee..1d2b3e902a84 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -28,7 +28,11 @@ extern unsigned long long max_possible_pfn;
 /**
  * enum memblock_flags - definition of memory region attributes
  * @MEMBLOCK_NONE: no special request
- * @MEMBLOCK_HOTPLUG: hotpluggable region
+ * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory
+ * map during early boot as hot(un)pluggable system RAM (e.g., memory range
+ * that might get hotunplugged later). With "movable_node" set on the kernel
+ * commandline, try keeping this memory region hotunpluggable. Does not apply
+ * to memblocks added ("hotplugged") after early boot.
  * @MEMBLOCK_MIRROR: mirrored region
  * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
  * reserved in the memory map; refer to memblock_mark_nomap() description
  * for further details

--
cgit v1.2.3

From 952eea9b01e4bbb7011329f1b7240844e61e5128 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 5 Nov 2021 13:44:49 -0700
Subject: memblock: allow to specify flags with memblock_add_node()

We want to specify flags when hotplugging memory. Let's prepare to pass flags to memblock_add_node() by adjusting all existing users.

Note that when hotplugging memory the system is already up and running and we might have concurrent memblock users: for example, while we're hotplugging memory, kexec_file code might search for suitable memory regions to place kexec images. It's important to add the memory directly to memblock via a single call with the right flags, instead of adding the memory first and applying flags later: otherwise, concurrent memblock users might temporarily stumble over memblocks with wrong flags, which will be important in a follow-up patch that introduces a new flag to properly handle add_memory_driver_managed().
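For example, a driver hotplugging memory would register the range and its flags in one step (a sketch; MEMBLOCK_DRIVER_MANAGED is only introduced by the follow-up patch mentioned above):

	static int add_driver_managed_range_sketch(phys_addr_t base,
						   phys_addr_t size, int nid)
	{
		/* One call with the final flags: a concurrent walker such as
		 * kexec_file can never observe the region as ordinary system
		 * RAM. Adding first and marking later would open exactly the
		 * race window described above. */
		return memblock_add_node(base, size, nid,
					 MEMBLOCK_DRIVER_MANAGED);
	}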
V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Oscar Salvador Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 ++- include/linux/mm.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1d2b3e902a84..8b6560dc7a9e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -104,7 +104,8 @@ static inline void memblock_discard(void) {} #endif void memblock_allow_resize(void); -int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid, + enum memblock_flags flags); int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_phys_free(phys_addr_t base, phys_addr_t size); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7e37c726b9db..15058d9cec99 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2425,7 +2425,7 @@ static inline unsigned long get_num_physpages(void) * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); -- cgit v1.2.3 From f7892d8e288d4b090176f26d9bf7943dbbb639a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Nov 2021 13:44:53 -0700 Subject: memblock: add MEMBLOCK_DRIVER_MANAGED to mimic IORESOURCE_SYSRAM_DRIVER_MANAGED Let's add a flag that corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED, indicating that we're dealing with a memory region that is never indicated in the firmware-provided memory map, but always detected and added by a driver. Similar to MEMBLOCK_HOTPLUG, most infrastructure has to treat such memory regions like ordinary MEMBLOCK_NONE memory regions -- for example, when selecting memory regions to add to the vmcore for dumping in the crashkernel via for_each_mem_range(). However, especially kexec_file is not supposed to select such memblocks via for_each_free_mem_range() / for_each_free_mem_range_reverse() to place kexec images, similar to how we handle IORESOURCE_SYSRAM_DRIVER_MANAGED without CONFIG_ARCH_KEEP_MEMBLOCK. We'll make sure that memory hotplug code sets the flag where applicable (IORESOURCE_SYSRAM_DRIVER_MANAGED) next. This prepares architectures that need CONFIG_ARCH_KEEP_MEMBLOCK, such as arm64, for virtio-mem support. Note that kexec *must not* indicate this memory to the second kernel and *must not* place kexec-images on this memory. Let's add a comment to kexec_walk_memblock(), documenting how we handle MEMBLOCK_DRIVER_MANAGED now just like using IORESOURCE_SYSRAM_DRIVER_MANAGED in locate_mem_hole_callback() for kexec_walk_resources(). Also note that MEMBLOCK_HOTPLUG cannot be reused due to different semantics: MEMBLOCK_HOTPLUG: memory is indicated as "System RAM" in the firmware-provided memory map and added to the system early during boot; kexec *has to* indicate this memory to the second kernel and can place kexec-images on this memory. After memory hotunplug, kexec has to be re-armed. 
We mostly ignore this flag when "movable_node" is not set on the kernel command line, because then we're told to not care about hotunpluggability of such memory regions. MEMBLOCK_DRIVER_MANAGED: memory is not indicated as "System RAM" in the firmware-provided memory map; this memory is always detected and added to the system by a driver; memory might not actually be physically hotunpluggable. kexec *must not* indicate this memory to the second kernel and *must not* place kexec-images on this memory. Link: https://lkml.kernel.org/r/20211004093605.5830-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport Cc: "Aneesh Kumar K . V" Cc: Arnd Bergmann Cc: Christian Borntraeger Cc: Eric Biederman Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Huacai Chen Cc: Jianyong Wu Cc: Jiaxun Yang Cc: Michal Hocko Cc: Oscar Salvador Cc: Shahab Vahedi Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8b6560dc7a9e..7df557b16c1e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -37,12 +37,17 @@ extern unsigned long long max_possible_pfn; * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as * reserved in the memory map; refer to memblock_mark_nomap() description * for further details + * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added + * via a driver, and never indicated in the firmware-provided memory map as + * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the + * kernel resource tree. */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */ MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ + MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ }; /** @@ -213,7 +218,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \ + p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from @@ -224,7 +230,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_HOTPLUG, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\ + p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas @@ -254,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } +static inline bool memblock_is_driver_managed(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_DRIVER_MANAGED; +} + int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, -- cgit v1.2.3 From d2c20e51e3966bc668ef1ef21fbe90704286c8d0 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 5 Nov 2021 13:45:06 -0700 Subject: mm/highmem: remove deprecated kmap_atomic kmap_atomic() is being deprecated in favor of 
kmap_local_page(). Replace the uses of kmap_atomic() within the highmem code. Profiling clear_huge_page() with ftrace showed an improvement of 62% on the setup below. Setup:- Below data has been collected on Qualcomm's SM7250 SoC THP enabled (kernel v4.19.113) with only CPU-0(Cortex-A55) and CPU-7(Cortex-A76) switched on and set to max frequency, also DDR set to perf governor. FTRACE Data:- Base data:- Number of iterations: 48 Mean of allocation time: 349.5 us std deviation: 74.5 us v4 data:- Number of iterations: 48 Mean of allocation time: 131 us std deviation: 32.7 us The following simple userspace experiment, which allocates 100MB (BUF_SZ) of pages and writes to them, gave us good insight: we observed an improvement of 42% in allocation and write timings. ------------------------------------------------------------- Test code snippet ------------------------------------------------------------- clock_start(); buf = malloc(BUF_SZ); /* Allocate 100 MB of memory */ for(i=0; i < BUF_SZ_PAGES; i++) { *((int *)(buf + (i*PAGE_SIZE))) = 1; } clock_end(); ------------------------------------------------------------- Malloc test timings for 100MB anon allocation:- Base data:- Number of iterations: 100 Mean of allocation time: 31831 us std deviation: 4286 us v4 data:- Number of iterations: 100 Mean of allocation time: 18193 us std deviation: 4915 us [willy@infradead.org: fix zero_user_segments()] Link: https://lkml.kernel.org/r/YYVhHCJcm2DM2G9u@casper.infradead.org Link: https://lkml.kernel.org/r/20210204073255.20769-2-prathu.baronia@oneplus.com Signed-off-by: Ira Weiny Signed-off-by: Prathu Baronia Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index b4c49f9cc379..31ebee36f26c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -143,9 +143,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif @@ -177,9 +177,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, static inline void clear_highpage(struct page *page) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); clear_page(kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE @@ -202,7 +202,7 @@ static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); @@ -213,7 +213,7 @@ static inline void zero_user_segments(struct page *page, if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); - kunmap_atomic(kaddr); + kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } @@ -238,11 +238,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from, { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_user_page(vto, vfrom,
vaddr, to); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif @@ -253,11 +253,11 @@ static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; - vfrom = kmap_atomic(from); - vto = kmap_atomic(to); + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); copy_page(vto, vfrom); - kunmap_atomic(vto); - kunmap_atomic(vfrom); + kunmap_local(vto); + kunmap_local(vfrom); } #endif -- cgit v1.2.3 From 53944f171a89dff4e2a3d76f42e6eedb551bb861 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 5 Nov 2021 13:45:18 -0700 Subject: mm: remove HARDENED_USERCOPY_FALLBACK This has served its purpose and is no longer used. All usercopy violations appear to have been handled by now; any remaining instances (or new bugs) will cause copies to be rejected. This isn't a direct revert of commit 2d891fbc3bb6 ("usercopy: Allow strict enforcement of whitelists"); since usercopy_fallback is effectively 0, the fallback handling is removed too. This also removes the usercopy_fallback module parameter on slab_common. Link: https://github.com/KSPP/linux/issues/153 Link: https://lkml.kernel.org/r/20210921061149.1091163-1-steve@sk2.org Signed-off-by: Stephen Kitt Suggested-by: Kees Cook Acked-by: Kees Cook Reviewed-by: Joel Stanley [defconfig change] Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: James Morris Cc: "Serge E . Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 837cb16232ef..181045148b06 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -142,8 +142,6 @@ struct mem_cgroup; void __init kmem_cache_init(void); bool slab_is_available(void); -extern bool usercopy_fallback; - struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -- cgit v1.2.3 From a1554c002699cbc9ced2e9f44f9c1357181bead3 Mon Sep 17 00:00:00 2001 From: Mianhan Liu Date: Fri, 5 Nov 2021 13:45:21 -0700 Subject: include/linux/mm.h: move nr_free_buffer_pages from swap.h to mm.h nr_free_buffer_pages could be exposed through mm.h instead of swap.h. The advantage of this change is that it can reduce the obsolete includes. For example, net/ipv4/tcp.c wouldn't need swap.h any more since it has already included mm.h. Similarly, after checking all the other files, it turns out that tcp.c, udp.c, meter.c, ... follow the same rule, so these files can have swap.h removed too. Moreover, after preprocessing all the files that use nr_free_buffer_pages, it turns out that those files have already included mm.h. Thus, we can move nr_free_buffer_pages from swap.h to mm.h safely. This change will not affect the compilation of other files. Link: https://lkml.kernel.org/r/20210912133640.1624-1-liumh1@shanghaitech.edu.cn Signed-off-by: Mianhan Liu Cc: Jakub Kicinski CC: Ulf Hansson Cc: "David S .
Miller" Cc: Simon Horman Cc: Pravin B Shelar Cc: Vlad Yasevich Cc: Marcelo Ricardo Leitner Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/swap.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 15058d9cec99..b1720aa63727 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -875,6 +875,8 @@ void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); void copy_huge_page(struct page *dst, struct page *src); +unsigned long nr_free_buffer_pages(void); + /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. diff --git a/include/linux/swap.h b/include/linux/swap.h index ba52f3a3478e..53f759a59996 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,7 +335,6 @@ void workingset_update_node(struct xa_node *node); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; -extern unsigned long nr_free_buffer_pages(void); /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) -- cgit v1.2.3 From f39f21b3ddc7fc0f87eb6dc75ddc81b5bbfb7672 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:25 -0700 Subject: stacktrace: move filter_irq_stacks() to kernel/stacktrace.c filter_irq_stacks() has little to do with the stackdepot implementation, except that it is usually used by users (such as KASAN) of stackdepot to reduce the stack trace. However, filter_irq_stacks() itself is not useful without a stack trace as obtained by stack_trace_save() and friends. Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new users of filter_irq_stacks() do not have to start depending on STACKDEPOT only for filter_irq_stacks(). Link: https://lkml.kernel.org/r/20210923104803.2620285-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Jann Horn Cc: Aleksandr Nogikh Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 2 -- include/linux/stacktrace.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index b2f7e7c6ba54..d29860966bc9 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -25,8 +25,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9edecb494e9e..bef158815e83 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, unsigned int size, unsigned int skipnr); unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); /* Internal interfaces. 
Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK -- cgit v1.2.3 From 07e8481d3c38f461d7b79c1d5c9afe013b162b0c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:46 -0700 Subject: kfence: always use static branches to guard kfence_alloc() Regardless of KFENCE mode (CONFIG_KFENCE_STATIC_KEYS: either using static keys to gate allocations, or using a simple dynamic branch), always use a static branch to avoid the dynamic branch in kfence_alloc() if KFENCE was disabled at boot. For CONFIG_KFENCE_STATIC_KEYS=n, this now avoids the dynamic branch if KFENCE was disabled at boot. To simplify, this also unifies the location where kfence_allocation_gate is read-checked to just be inline in kfence_alloc(). Link: https://lkml.kernel.org/r/20211019102524.2807208-1-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 3fe6dd8a18c1..4b5e3679a72c 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -14,6 +14,9 @@ #ifdef CONFIG_KFENCE +#include <linux/atomic.h> +#include <linux/static_key.h> + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an @@ -22,13 +25,8 @@ #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE) extern char *__kfence_pool; -#ifdef CONFIG_KFENCE_STATIC_KEYS -#include <linux/static_key.h> DECLARE_STATIC_KEY_FALSE(kfence_allocation_key); -#else -#include <linux/atomic.h> extern atomic_t kfence_allocation_gate; -#endif /** * is_kfence_address() - check if an address belongs to KFENCE pool @@ -116,13 +114,16 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags); */ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { -#ifdef CONFIG_KFENCE_STATIC_KEYS - if (static_branch_unlikely(&kfence_allocation_key)) +#if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0 + if (!static_branch_unlikely(&kfence_allocation_key)) + return NULL; #else - if (unlikely(!atomic_read(&kfence_allocation_gate))) + if (!static_branch_likely(&kfence_allocation_key)) + return NULL; #endif - return __kfence_alloc(s, size, flags); - return NULL; + if (likely(atomic_read(&kfence_allocation_gate))) + return NULL; + return __kfence_alloc(s, size, flags); } /** -- cgit v1.2.3 From d2f272b35a84ace2ef04334a9822fd726a7f061b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:04 -0700 Subject: include/linux/damon.h: fix kernel-doc comments for 'damon_callback' A few Kernel-doc comments in 'damon.h' are broken. This fixes them. Link: https://lkml.kernel.org/r/20210917123958.3819-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d68b67b8d458..755d70804705 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -62,7 +62,7 @@ struct damon_target { struct damon_ctx; /** - * struct damon_primitive Monitoring primitives for given use cases. + * struct damon_primitive - Monitoring primitives for given use cases. * * @init: Initialize primitive-internal data structures.
* @update: Update primitive-internal data structures. @@ -108,8 +108,8 @@ struct damon_primitive { void (*cleanup)(struct damon_ctx *context); }; -/* - * struct damon_callback Monitoring events notification callbacks. +/** + * struct damon_callback - Monitoring events notification callbacks. * * @before_start: Called before starting the monitoring. * @after_sampling: Called after each sampling. -- cgit v1.2.3 From fda504fade7f124858d7022341dc46ff35b45274 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:18 -0700 Subject: mm/damon/core: account age of target regions Patch series "Implement Data Access Monitoring-based Memory Operation Schemes". Introduction ============ DAMON[1] can be used as a primitive for data access aware memory management optimizations. For that, users who want such optimizations should run DAMON, read the monitoring results, analyze them, plan a new memory management scheme, and apply the new scheme by themselves. Such efforts will be inevitable for some complicated optimizations. However, in many other cases, the users would simply want the system to apply a memory management action to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB keeping only rare accesses more than 2 minutes", or "Do not use THP for a memory region larger than 2 MiB rarely accessed for more than 1 second". To make this work easier and non-redundant, this patchset implements a new feature of DAMON, which is called Data Access Monitoring-based Operation Schemes (DAMOS). Using the feature, users can describe the normal schemes in a simple way and ask DAMON to execute those on its own. [1] https://damonitor.github.io Evaluations =========== DAMOS is accurate and useful for memory management optimizations. An experimental DAMON-based operation scheme for THP, 'ethp', removes 76.15% of THP memory overheads while preserving 51.25% of THP speedup. Another experimental DAMON-based 'proactive reclamation' implementation, 'prcl', reduces 93.38% of resident sets and 23.63% of system memory footprint while incurring only 1.22% runtime overhead in the best case (parsec3/freqmine). NOTE that the experimental THP optimization and proactive reclamation are not for production but only proofs of concept. Please refer to the showcase web site's evaluation document[1] for detailed evaluation setup and results. [1] https://damonitor.github.io/doc/html/v34/vm/damon/eval.html Long-term Support Trees ----------------------- For people who want to test DAMON on LTS kernels, there is another couple of trees based on the two latest LTS kernels respectively, containing the 'damon/master' backports. - For v5.4.y: https://git.kernel.org/sj/h/damon/for-v5.4.y - For v5.10.y: https://git.kernel.org/sj/h/damon/for-v5.10.y Sequence Of Patches =================== The 1st patch accounts age of each region. The 2nd patch implements the core of the DAMON-based operation schemes feature. The 3rd patch makes the default monitoring primitives for virtual address spaces support the schemes. From this point, kernel space users can use DAMOS. The 4th patch exports the feature to the user space via the debugfs interface. The 5th patch implements a schemes statistics feature for easier tuning of the schemes and runtime access pattern analysis, and the 6th patch adds selftests for these changes. Finally, the 7th patch documents this new feature.
This patch (of 7): DAMON can be used for data access pattern aware memory management optimizations. For that, users should run DAMON, read the monitoring results, analyze them, plan a new memory management scheme, and apply the new scheme by themselves. It would not be too hard, but it still requires some level of effort. For complicated cases, this effort is inevitable. That said, in many cases, users would simply want to apply an action to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds". For such optimizations, users will need to first account the age of each region themselves. To reduce such efforts, this implements simple age accounting for each region in DAMON. For each aggregation step, DAMON compares the access frequency with that from the last aggregation and resets the age of the region if the change is significant. Otherwise, the age is incremented. Also, in case of the merge of regions, the region size-weighted average of the ages is set as the age of the merged new region. Link: https://lkml.kernel.org/r/20211001125604.29660-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211001125604.29660-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rienjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 755d70804705..3e8215debbd4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -31,12 +31,22 @@ struct damon_addr_range { * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. * @list: List head for siblings. + * @age: Age of this region. + * + * @age is initially zero, increased for each aggregation interval, and reset + * to zero again if the access frequency is significantly changed. If two + * regions are merged into a new region, both @nr_accesses and @age of the new + * region are set as region size-weighted average of those of the two regions. */ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; struct list_head list; + + unsigned int age; +/* private: Internal value for age calculation. */ + unsigned int last_nr_accesses; }; /** -- cgit v1.2.3 From 1f366e421c8f69583ed37b56d86e3747331869c3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:22 -0700 Subject: mm/damon/core: implement DAMON-based Operation Schemes (DAMOS) In many cases, users might use DAMON for simple data access aware memory management optimizations such as applying an operation scheme to a memory region of a specific size having a specific access frequency for a specific time. For example, "page out a memory region larger than 100 MiB but having a low access frequency more than 10 minutes", or "Use THP for a memory region larger than 2 MiB having a high access frequency for more than 2 seconds".
The simplest form of the solution would be doing offline data access pattern profiling using DAMON and modifying the application source code or system configuration based on the profiling results. Or, developing a daemon constructed with two modules (one for access monitoring and the other for applying memory management actions via mlock(), madvise(), sysctl, etc) is imaginable. To avoid users spending their time on implementing such simple data access monitoring-based operation schemes, this makes DAMON handle such schemes directly. With this change, users can simply specify their desired schemes to DAMON. Then, DAMON will automatically apply the schemes to the user-specified target processes. Each of the schemes is composed of conditions for filtering of the target memory regions and the desired memory management action for the target. Specifically, the format is:: <min/max size> <min/max access frequency> <min/max age> <action> The filtering conditions are size of memory region, number of accesses to the region monitored by DAMON, and the age of the region. The age of region is incremented periodically but reset when its addresses or access frequency has significantly changed or the action of a scheme was applied. For the action, the current implementation supports a few of madvise()-like hints, ``WILLNEED``, ``COLD``, ``PAGEOUT``, ``HUGEPAGE``, and ``NOHUGEPAGE``. Because DAMON supports various address spaces and application of the actions to a monitoring target region is dependent on the type of the target address space, the application code should be implemented by each primitive and registered to the framework. Note that this only implements the framework part. The following commit will implement the action application for the virtual address space primitives. Link: https://lkml.kernel.org/r/20211001125604.29660-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3e8215debbd4..dbe18b0fb795 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -69,6 +69,48 @@ struct damon_target { struct list_head list; }; +/** + * enum damos_action - Represents an action of a Data Access Monitoring-based + * Operation Scheme. + * + * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. + * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. + * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. + * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + */ +enum damos_action { + DAMOS_WILLNEED, + DAMOS_COLD, + DAMOS_PAGEOUT, + DAMOS_HUGEPAGE, + DAMOS_NOHUGEPAGE, +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @min_sz_region: Minimum size of target regions. + * @max_sz_region: Maximum size of target regions. + * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. + * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. + * @min_age_region: Minimum age of target regions. + * @max_age_region: Maximum age of target regions.
+ * @action: &damo_action to be applied to the target regions. + * @list: List head for siblings. + * + * Note that both the minimums and the maximums are inclusive. + */ +struct damos { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; + enum damos_action action; + struct list_head list; +}; + struct damon_ctx; /** @@ -79,6 +121,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. * @@ -104,6 +147,9 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @apply_scheme is called from @kdamond when a region for user provided + * DAMON-based operation scheme is found. It should apply the scheme's action + * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -114,6 +160,8 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; @@ -192,6 +240,7 @@ struct damon_callback { * @min_nr_regions: The minimum number of adaptive monitoring regions. * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. + * @schemes: Head of schemes (&damos) list. 
*/ struct damon_ctx { unsigned long sample_interval; @@ -213,6 +262,7 @@ struct damon_ctx { unsigned long min_nr_regions; unsigned long max_nr_regions; struct list_head adaptive_targets; + struct list_head schemes; }; #define damon_next_region(r) \ @@ -233,6 +283,12 @@ struct damon_ctx { #define damon_for_each_target_safe(t, next, ctx) \ list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list) +#define damon_for_each_scheme(s, ctx) \ + list_for_each_entry(s, &(ctx)->schemes, list) + +#define damon_for_each_scheme_safe(s, next, ctx) \ + list_for_each_entry_safe(s, next, &(ctx)->schemes, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -242,6 +298,14 @@ inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); +struct damos *damon_new_scheme( + unsigned long min_sz_region, unsigned long max_sz_region, + unsigned int min_nr_accesses, unsigned int max_nr_accesses, + unsigned int min_age_region, unsigned int max_age_region, + enum damos_action action); +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); +void damon_destroy_scheme(struct damos *s); + struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); void damon_free_target(struct damon_target *t); @@ -255,6 +319,8 @@ int damon_set_targets(struct damon_ctx *ctx, int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_schemes(struct damon_ctx *ctx, + struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From 6dea8add4d2875b80843e4a4c8acd334a4db8c8f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:25 -0700 Subject: mm/damon/vaddr: support DAMON-based Operation Schemes This makes DAMON's default primitives for virtual address spaces support DAMON-based Operation Schemes (DAMOS) by implementing action application functions and registering them to the monitoring context. The implementation simply links 'madvise()' for the related DAMOS actions. That is, 'madvise(MADV_WILLNEED)' is called for the 'WILLNEED' DAMOS action, and similarly for the other actions ('COLD', 'PAGEOUT', 'HUGEPAGE', 'NOHUGEPAGE'). So, kernel space DAMON users can now use the DAMON-based optimizations with only a small amount of code.
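To give a rough picture of that small amount of code, a kernel-space user could wire up such a scheme roughly as in the sketch below. This is illustrative only: damon_new_scheme(), damon_set_schemes() and damon_va_set_primitives() are the interfaces declared in the diffs above, while damon_new_ctx() and the surrounding error handling are assumptions of the sketch, not part of this header change.

    /*
     * Sketch: page out regions of at least 100 MiB that showed no
     * accesses for at least 10 aggregation intervals.
     */
    struct damon_ctx *ctx = damon_new_ctx();    /* assumed core helper */
    struct damos *scheme;

    damon_va_set_primitives(ctx);
    scheme = damon_new_scheme(100 * 1024 * 1024, ULONG_MAX, /* size */
                              0, 0,                         /* nr_accesses */
                              10, UINT_MAX,                 /* age */
                              DAMOS_PAGEOUT);
    if (scheme)
            damon_set_schemes(ctx, &scheme, 1);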
Link: https://lkml.kernel.org/r/20211001125604.29660-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index dbe18b0fb795..be6b6e81e8ee 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -337,6 +337,8 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); +int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ -- cgit v1.2.3 From 2f0b548c9f03a78f4ce6ab48986e3108028936a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:32 -0700 Subject: mm/damon/schemes: implement statistics feature To tune the DAMON-based operation schemes, knowing how many and how large regions are affected by each of the schemes will be helpful. Those stats could be used for not only the tuning, but also monitoring of the working set size and the number of regions, if the scheme does not change the program behavior too much. For that reason, this implements the statistics for the schemes. The total number and size of the regions that each scheme is applied to are exported to users via '->stat_count' and '->stat_sz' of 'struct damos'. Admins can also check the numbers by reading the 'schemes' debugfs file. The last two integers now represent the stats. To allow collecting the stats without changing the program behavior, this also adds a new scheme action, 'DAMOS_STAT'. Note that 'DAMOS_STAT' not only performs no memory operation action, but also does not reset the age of regions. Link: https://lkml.kernel.org/r/20211001125604.29660-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index be6b6e81e8ee..f301bb53381c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -78,6 +78,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_STAT: Do nothing but count the stat. */ enum damos_action { DAMOS_WILLNEED, @@ -85,6 +86,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_STAT, /* Do nothing but only record the stat */ }; /** @@ -96,9 +98,13 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions.
+ * @stat_count: Total number of regions that this scheme is applied. + * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * Note that both the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON applies @action to monitoring target + * regions fit in the condition and updates the statistics. Note that both + * the minimums and the maximums are inclusive. */ struct damos { unsigned long min_sz_region; @@ -108,6 +114,8 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + unsigned long stat_count; + unsigned long stat_sz; struct list_head list; }; -- cgit v1.2.3 From a28397beb55b68bd0f15c6778540e8ae1bc26d21 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:46:56 -0700 Subject: mm/damon: implement primitives for physical address space monitoring This implements the monitoring primitives for the physical memory address space. Internally, it uses the PTE Accessed bit, similar to that of the virtual address spaces monitoring primitives. It supports only user memory pages, as idle pages tracking does. If the monitoring target physical memory address range contains non-user memory pages, access check of the pages will do nothing but simply treat the pages as not accessed. Link: https://lkml.kernel.org/r/20211012205711.29216-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Brendan Higgins Cc: David Hildenbrand Cc: David Rienjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f301bb53381c..715dadd21f7c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -351,4 +351,14 @@ void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ +#ifdef CONFIG_DAMON_PADDR + +/* Monitoring primitives for the physical memory address space */ +void damon_pa_prepare_access_checks(struct damon_ctx *ctx); +unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); +bool damon_pa_target_valid(void *t); +void damon_pa_set_primitives(struct damon_ctx *ctx); + +#endif /* CONFIG_DAMON_PADDR */ + #endif /* _DAMON_H */ -- cgit v1.2.3 From 57223ac295845b1d72ec1bd02b5fab992b77a021 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:13 -0700 Subject: mm/damon/paddr: support the pageout scheme Introduction ============ This patchset 1) makes the engine for general data access pattern-oriented memory management (DAMOS) more useful for production environments, and 2) implements a static kernel module for lightweight proactive reclamation using the engine. Proactive Reclamation --------------------- On general memory over-committed systems, proactively reclaiming cold pages helps save memory and reduce latency spikes incurred by direct reclaim or the CPU consumption of kswapd, while incurring only minimal performance degradation[2]. A Free Pages Reporting[8] based memory over-commit virtualization system would be one more specific use case. In the system, the guest VMs report their free memory to the host, and the host reallocates the reported memory to other guests. As a result, the system's memory utilization can be maximized.
However, the guests could be not so memory-frugal, because some kernel subsystems and user-space applications are designed to use as much memory as available. Then, guests would report only a small amount of free memory to the host, resulting in poor memory utilization. Running the proactive reclamation in such guests could help mitigate this problem. Google has also implemented this idea and is using it in their data center. They further proposed upstreaming it in LSFMM'19, and "the general consensus was that, while this sort of proactive reclaim would be useful for a number of users, the cost of this particular solution was too high to consider merging it upstream"[3]. The cost mainly comes from the coldness tracking. Roughly speaking, the implementation periodically scans the 'Accessed' bit of each page. For this reason, the overhead increases linearly as the size of the memory and the scanning frequency grow. As a result, Google is known to dedicate one CPU to the work. That's a reasonable option for someone like Google, but it wouldn't be so for some others. DAMON and DAMOS: An engine for data access pattern-oriented memory management ----------------------------------------------------------------------------- DAMON[4] is a framework for general data access monitoring. Its adaptive monitoring overhead control feature minimizes its monitoring overhead. It also lets the upper-bound of the overhead be configurable by clients, regardless of the size of the monitoring target memory. While monitoring 70 GiB memory of a production system every 5 milliseconds, it consumes less than 1% of single CPU time. For this, it could sacrifice some of the quality of the monitoring results. Nevertheless, the lower-bound of the quality is configurable, and it uses a best-effort algorithm for better quality. Our test results[5] show the quality is practical enough. From the production system monitoring, we were able to find a 4 KiB region in the 70 GiB memory that shows the highest access frequency. We normally don't monitor the data access pattern just for fun but to improve something like memory management. Proactive reclamation is one such usage. For such general cases, DAMON provides a feature called DAMON-based Operation Schemes (DAMOS)[6]. It makes DAMON an engine for general data access pattern oriented memory management. Using this, clients can ask DAMON to find memory regions of a specific data access pattern and apply some memory management action (e.g., page out, move to head of the LRU list, use huge page, ...). We call the request a 'scheme'. Proactive Reclamation on top of DAMON/DAMOS ------------------------------------------- Therefore, by using DAMON for the cold pages detection, the proactive reclamation's monitoring overhead issue can be solved. Actually, we previously implemented a version of proactive reclamation using DAMOS and achieved noticeable improvements with our evaluation setup[5]. Nevertheless, it was more of a proof of concept than for production use. It supports only virtual address spaces of processes, and requires additional tuning effort for given workloads and the hardware. For the tuning, we introduced a simple auto-tuning user space tool[7]. Google is also known to use a similar ML-based approach for their fleets[2]. But, making it just work with intuitive knobs in the kernel would be helpful for general users.
To this end, this patchset improves DAMOS to be ready for such production usages, and implements another version of the proactive reclamation, namely DAMON_RECLAIM, on top of it. DAMOS Improvements: Aggressiveness Control, Prioritization, and Watermarks -------------------------------------------------------------------------- First of all, the current version of DAMOS supports only virtual address spaces. This patchset makes it support the physical address space for the page out action. The next major problem of the current version of DAMOS is the lack of aggressiveness control, which can result in arbitrary overhead. For example, if huge memory regions having the data access pattern of interest are found, applying the requested action to all of the regions could incur significant overhead. It can be controlled by tuning the target data access pattern with manual or automated approaches[2,7]. But, some people would prefer the kernel to just work with only intuitive tuning or default values. For such cases, this patchset implements a safeguard, namely time/size quota. Using this, the clients can specify up to how much time can be used for applying the action, and/or up to how much memory the action can be applied to within a user-specified time duration. A followup question is, to which memory regions should the action be applied within the limits? We implement a simple regions prioritization mechanism for each action and make DAMOS apply the action to high priority regions first. It also allows clients to tune the prioritization mechanism to use different weights for size, access frequency, and age of memory regions. This means we could use not only LRU but also LFU or some fancy algorithms like CAR[9] with lightweight overhead. Though DAMON is lightweight, someone would want to remove even the cold pages monitoring overhead when it is unnecessary. Currently, it must be manually turned on and off by clients, but some clients would simply want to turn it on and off based on some metrics like free memory ratio or memory fragmentation. For such cases, this patchset implements a watermarks-based automatic activation feature. It allows the clients to configure the metric of their interest, and three watermarks of the metric. If the metric is higher than the high watermark or lower than the low watermark, the scheme is deactivated. If the metric is lower than the mid watermark but higher than the low watermark, the scheme is activated. DAMON-based Reclaim ------------------- Using the improved version of DAMOS, this patchset implements a static kernel module called 'damon_reclaim'. It finds memory regions that weren't accessed for a specific time duration and pages them out. Consuming too much CPU for the paging out operations, or doing pageout too frequently, can be critical for systems configuring their swap devices with software-defined in-memory block devices like zram/zswap or devices with a limited total number of writes like SSDs, respectively. To avoid the problems, the time/size quotas can be configured. Under the quotas, it pages out memory regions that weren't accessed for the longest time first. Also, to remove the monitoring overhead in peaceful situations, and to fall back to the LRU-list based page granularity reclamation when it doesn't make progress, the three watermarks based activation mechanism is used, with the free memory ratio as the watermark metric. For convenient configurations, it provides several module parameters.
Using these, sysadmins can enable/disable it, and tune its parameters including the coldness identification time threshold, the time/size quotas and the three watermarks. Evaluation ========== In short, DAMON_RECLAIM with a 50ms/s time quota and regions prioritization on a v5.15-rc5 Linux kernel with a ZRAM swap device achieves 38.58% memory saving with only 1.94% runtime overhead. For this, DAMON_RECLAIM consumes only 4.97% of single CPU time. Setup ----- We evaluate DAMON_RECLAIM to show how each of the DAMOS improvements takes effect. For this, we measure DAMON_RECLAIM's CPU consumption, entire system memory footprint, total number of major page faults, and runtime of 24 realistic workloads in the PARSEC3 and SPLASH-2X benchmark suites on my QEMU/KVM based virtual machine. The virtual machine runs on an i3.metal AWS instance, has 130GiB memory, and runs a linux kernel built on latest -mm tree[1] plus this patchset. It also utilizes a 4 GiB ZRAM swap device. We repeat the measurement 5 times and use averages. [1] https://github.com/hnaz/linux-mm/tree/v5.15-rc5-mmots-2021-10-13-19-55 Detailed Results ---------------- The results are summarized in the below table. With a coldness identification threshold of 5 seconds, DAMON_RECLAIM without the time quota-based speed limit achieves 47.21% memory saving, but incurs 4.59% runtime slowdown to the workloads on average. For this, DAMON_RECLAIM consumes about 11.28% of single CPU time. Applying time quotas of 200ms/s, 50ms/s, and 10ms/s without the regions prioritization reduces the slowdown to 4.89%, 2.65%, and 1.5%, respectively. The time quota of 200ms/s (20%) makes no real change compared to the quota unapplied version, because the quota unapplied version consumes only 11.28% CPU time. DAMON_RECLAIM's CPU utilization was also similarly reduced: 11.24%, 5.51%, and 2.01% of single CPU time. That is, the overhead is proportional to the speed limit. Nevertheless, it also reduces the memory saving because it becomes less aggressive. In detail, the three variants show 48.76%, 37.83%, and 7.85% memory saving, respectively. Applying the regions prioritization (page out regions that were not accessed for longer first, within the time quota) further reduces the performance degradation. The runtime slowdown and the increase in the total number of major page faults changed as 4.89%/218,690% -> 4.39%/166,136% (200ms/s), 2.65%/111,886% -> 1.94%/59,053% (50ms/s), and 1.5%/34,973.40% -> 2.08%/8,781.75% (10ms/s). The runtime under the 10ms/s time quota has increased with prioritization, but apparently that's within the margin of error.

time quota   prioritization   memory_saving   cpu_util   slowdown   pgmajfaults overhead
N            N                47.21%          11.28%     4.59%      194,802%
200ms/s      N                48.76%          11.24%     4.89%      218,690%
50ms/s       N                37.83%          5.51%      2.65%      111,886%
10ms/s       N                7.85%           2.01%      1.5%       34,793.40%
200ms/s      Y                50.08%          10.38%     4.39%      166,136%
50ms/s       Y                38.58%          4.97%      1.94%      59,053%
10ms/s       Y                3.63%           1.73%      2.08%      8,781.75%

Baseline and Complete Git Trees =============================== The patches are based on the latest -mm tree (v5.15-rc5-mmots-2021-10-13-19-55). You can also clone the complete git tree from: $ git clone git://github.com/sjp38/linux -b damon_reclaim/patches/v1 The web is also available: https://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git/tag/?h=damon_reclaim/patches/v1 Sequence Of Patches =================== The first patch makes DAMOS support the physical address space for the page out action. The following five patches (patches 2-6) implement the time/size quotas.
The next four patches (patches 7-10) implement the memory regions prioritization within the limit. Then, the three following patches (patches 11-13) implement the watermarks-based schemes activation. Finally, the last two patches (patches 14-15) implement and document the DAMON-based reclamation using the advanced DAMOS. [1] https://www.kernel.org/doc/html/v5.15-rc1/vm/damon/index.html [2] https://research.google/pubs/pub48551/ [3] https://lwn.net/Articles/787611/ [4] https://damonitor.github.io [5] https://damonitor.github.io/doc/html/latest/vm/damon/eval.html [6] https://lore.kernel.org/linux-mm/20211001125604.29660-1-sj@kernel.org/ [7] https://github.com/awslabs/damoos [8] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html [9] https://www.usenix.org/conference/fast-04/car-clock-adaptive-replacement This patch (of 15): This makes the DAMON primitives for the physical address space support the pageout action for DAMON-based Operation Schemes. Hence, with this commit, users can easily implement system-level data access-aware reclamations using DAMOS. [sj@kernel.org: fix missing-prototype build warning] Link: https://lkml.kernel.org/r/20211025064220.13904-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211019150731.16699-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Cameron Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: Jonathan Corbet Cc: David Hildenbrand Cc: David Woodhouse Cc: Marco Elver Cc: Leonard Foerster Cc: Greg Thelen Cc: Markus Boehme Cc: David Rientjes Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 715dadd21f7c..9a327bc787b5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx); void damon_pa_prepare_access_checks(struct damon_ctx *ctx); unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); +int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ -- cgit v1.2.3 From 2b8a248d5873343aa16f6c5ede30517693995f13 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:16 -0700 Subject: mm/damon/schemes: implement size quota for schemes application speed control There could be arbitrarily large memory regions fulfilling the target data access pattern of a DAMON-based operation scheme. In that case, applying the action of the scheme could incur excessively high overhead. To provide an intuitive way to avoid that, this implements a feature called size quota. If the quota is set, DAMON tries to apply the action only up to the given amount of memory regions within a given time window.
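For illustration, limiting such a pageout scheme to 100 MiB of regions per second could look roughly like the sketch below. The values are illustrative only; the quota-taking damon_new_scheme() signature is the one this patch introduces (see the diff that follows).

    /*
     * Sketch: let the scheme apply DAMOS_PAGEOUT to at most 100 MiB of
     * regions per 1000ms charge window.
     */
    struct damos_quota quota = {
            .sz = 100 * 1024 * 1024,        /* size quota, in bytes */
            .reset_interval = 1000,         /* charge window, in ms */
    };
    struct damos *scheme = damon_new_scheme(100 * 1024 * 1024, ULONG_MAX,
                                            0, 0, 10, UINT_MAX,
                                            DAMOS_PAGEOUT, &quota);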
Link: https://lkml.kernel.org/r/20211019150731.16699-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 9a327bc787b5..3a1ce9d9921c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -89,6 +89,26 @@ enum damos_action { DAMOS_STAT, /* Do nothing but only record the stat */ }; +/** + * struct damos_quota - Controls the aggressiveness of the given scheme. + * @sz: Maximum bytes of memory that the action can be applied. + * @reset_interval: Charge reset interval in milliseconds. + * + * To avoid consuming too much CPU time or IO resources for applying the + * &struct damos->action to large memory, DAMON allows users to set a size + * quota. The quota can be set by writing non-zero values to &sz. If the size + * quota is set, DAMON tries to apply the action only up to &sz bytes within + * &reset_interval. + */ +struct damos_quota { + unsigned long sz; + unsigned long reset_interval; + +/* private: For charging the quota */ + unsigned long charged_sz; + unsigned long charged_from; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -98,13 +118,20 @@ enum damos_action { * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. + * @quota: Control the aggressiveness of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. * - * For each aggregation interval, DAMON applies @action to monitoring target - * regions fit in the condition and updates the statistics. Note that both - * the minimums and the maximums are inclusive. + * For each aggregation interval, DAMON finds regions which fit in the + * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, + * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to + * those. To avoid consuming too much CPU time or IO resources for the + * &action, &quota is used. + * + * After applying the &action to each region, &stat_count and &stat_sz is + * updated to reflect the number of regions and total size of regions that the + * &action is applied.
*/ struct damos { unsigned long min_sz_region; @@ -114,6 +141,7 @@ struct damos { unsigned int min_age_region; unsigned int max_age_region; enum damos_action action; + struct damos_quota quota; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -310,7 +338,7 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action); + enum damos_action action, struct damos_quota *quota); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From 50585192bc2ef9309d32dabdbb5e735679f4f128 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:20 -0700 Subject: mm/damon/schemes: skip already charged targets and regions If DAMOS has stopped applying an action in the middle of a group of memory regions due to its size quota, it starts the work again from the beginning of the address space in the next charge window. If there is a huge memory region at the beginning of the address space and it always fulfills the scheme's target data access pattern, the action will be applied to only that region. This mitigates the case by skipping, at the beginning of the next charge window, memory regions that were charged in the current charge window. Link: https://lkml.kernel.org/r/20211019150731.16699-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 3a1ce9d9921c..585d985768fd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -107,6 +107,8 @@ struct damos_quota { /* private: For charging the quota */ unsigned long charged_sz; unsigned long charged_from; + struct damon_target *charge_target_from; + unsigned long charge_addr_from; }; /** @@ -307,6 +309,9 @@ struct damon_ctx { #define damon_prev_region(r) \ (container_of(r->list.prev, struct damon_region, list)) +#define damon_last_region(t) \ + (list_last_entry(&t->regions_list, struct damon_region, list)) + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) -- cgit v1.2.3 From 1cd2430300594a230dba9178ac9e286d868d9da2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:23 -0700 Subject: mm/damon/schemes: implement time quota The size quota feature of DAMOS is useful for IO resource-critical systems, but not so intuitive for CPU time-critical systems. Systems using zram or zswap-like swap devices would be examples. To provide another intuitive way for such systems, this implements a time-based quota for DAMON-based Operation Schemes. If the quota is set, DAMOS tries to use only up to the user-defined quota of CPU time within a given time window.
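As a sketch, the 10ms/s configuration evaluated earlier in this series would only need the &ms field of the same struct (values illustrative; the conversion to an effective size quota via estimated throughput is handled internally, as the kernel-doc in the diff below describes):

    /*
     * Sketch: allow the action at most 10ms of CPU time per 1000ms
     * window.  A zero .sz means no additional size quota.
     */
    struct damos_quota quota = {
            .ms = 10,
            .sz = 0,
            .reset_interval = 1000,
    };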
Link: https://lkml.kernel.org/r/20211019150731.16699-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 585d985768fd..1e7671bf3d23 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,20 +91,35 @@ enum damos_action { /** * struct damos_quota - Controls the aggressiveness of the given scheme. + * @ms: Maximum milliseconds that the scheme can use. * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * * To avoid consuming too much CPU time or IO resources for applying the - * &struct damos->action to large memory, DAMON allows users to set a size - * quota. The quota can be set by writing non-zero values to &sz. If the size - * quota is set, DAMON tries to apply the action only up to &sz bytes within - * &reset_interval. + * &struct damos->action to large memory, DAMON allows users to set time and/or + * size quotas. The quotas can be set by writing non-zero values to &ms and + * &sz, respectively. If the time quota is set, DAMON tries to use only up to + * &ms milliseconds within &reset_interval for applying the action. If the + * size quota is set, DAMON tries to apply the action only up to &sz bytes + * within &reset_interval. + * + * Internally, the time quota is transformed to a size quota using estimated + * throughput of the scheme's action. DAMON then compares it against &sz and + * uses smaller one as the effective quota. */ struct damos_quota { + unsigned long ms; unsigned long sz; unsigned long reset_interval; -/* private: For charging the quota */ +/* private: */ + /* For throughput estimation */ + unsigned long total_charged_sz; + unsigned long total_charged_ns; + + unsigned long esz; /* Effective size quota in bytes */ + + /* For charging the quota */ unsigned long charged_sz; unsigned long charged_from; struct damon_target *charge_target_from; -- cgit v1.2.3 From 38683e003153f7abfa612d7b7fe147efa4624af2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:33 -0700 Subject: mm/damon/schemes: prioritize regions within the quotas This makes DAMON apply schemes to regions having higher priority first, if it cannot apply schemes to all regions due to the quotas. The prioritization function should be implemented in the monitoring primitives. Those would commonly calculate the priority of the region using attributes of regions, namely 'size', 'nr_accesses', and 'age'. For example, some primitive would calculate the priority of each region using a weighted sum of 'nr_accesses' and 'age' of the region. The optimal weights would depend on the given environment, so this makes those customizable. Nevertheless, the score calculation functions are only encouraged to respect the weights, not mandated.
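Continuing the quota sketches above, a user who wants the pageout to hit the coldest and oldest regions first could weight the attributes like this (the values are illustrative, not recommendations):

    /*
     * Sketch: mostly ignore region size, and let access frequency and
     * age dominate the per-region priority score within the quota.
     */
    quota.weight_sz = 0;
    quota.weight_nr_accesses = 1;
    quota.weight_age = 3;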
Link: https://lkml.kernel.org/r/20211019150731.16699-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1e7671bf3d23..5d47ad9e3911 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -14,6 +14,8 @@ /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE +/* Max priority score for DAMON-based operation schemes */ +#define DAMOS_MAX_SCORE (99) /** * struct damon_addr_range - Represents an address region of [@start, @end). @@ -95,6 +97,10 @@ enum damos_action { * @sz: Maximum bytes of memory that the action can be applied. * @reset_interval: Charge reset interval in milliseconds. * + * @weight_sz: Weight of the region's size for prioritization. + * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. + * @weight_age: Weight of the region's age for prioritization. + * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -106,12 +112,22 @@ enum damos_action { * Internally, the time quota is transformed to a size quota using estimated * throughput of the scheme's action. DAMON then compares it against &sz and * uses smaller one as the effective quota. + * + * For selecting regions within the quota, DAMON prioritizes current scheme's + * target memory regions using the &struct damon_primitive->get_scheme_score. + * You could customize the prioritization logic by setting &weight_sz, + * &weight_nr_accesses, and &weight_age, because monitoring primitives are + * encouraged to respect those. */ struct damos_quota { unsigned long ms; unsigned long sz; unsigned long reset_interval; + unsigned int weight_sz; + unsigned int weight_nr_accesses; + unsigned int weight_age; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -124,6 +140,10 @@ struct damos_quota { unsigned long charged_from; struct damon_target *charge_target_from; unsigned long charge_addr_from; + + /* For prioritization */ + unsigned long histogram[DAMOS_MAX_SCORE + 1]; + unsigned int min_score; }; /** @@ -174,6 +194,7 @@ struct damon_ctx; * @prepare_access_checks: Prepare next access check of target regions. * @check_accesses: Check the accesses to target regions. * @reset_aggregated: Reset aggregated accesses monitoring results. + * @get_scheme_score: Get the score of a region for a scheme. * @apply_scheme: Apply a DAMON-based operation scheme. * @target_valid: Determine if the target is valid. * @cleanup: Clean up the context. @@ -200,6 +221,8 @@ struct damon_ctx; * of its update. The value will be used for regions adjustment threshold. * @reset_aggregated should reset the access monitoring results that aggregated * by @check_accesses. + * @get_scheme_score should return the priority score of a region for a scheme + * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. 
It should apply the scheme's action * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. @@ -213,6 +236,9 @@ struct damon_primitive { void (*prepare_access_checks)(struct damon_ctx *context); unsigned int (*check_accesses)(struct damon_ctx *context); void (*reset_aggregated)(struct damon_ctx *context); + int (*get_scheme_score)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); bool (*target_valid)(void *target); -- cgit v1.2.3 From 198f0f4c58b9f481e4e51c8c70a6ab9852bbab7f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:37 -0700 Subject: mm/damon/vaddr,paddr: support pageout prioritization This makes the default monitoring primitives for virtual address spaces and the physical address space support memory region prioritization for the 'PAGEOUT' DAMOS action. It calculates the hotness of each region as a weighted sum of the region's 'nr_accesses' and 'age', and derives the priority score as the reverse of the hotness, so that cold regions can be paged out first. Link: https://lkml.kernel.org/r/20211019150731.16699-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 5d47ad9e3911..1217566a0ebc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -421,6 +421,8 @@ bool damon_va_target_valid(void *t); void damon_va_cleanup(struct damon_ctx *ctx); int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_VADDR */ @@ -433,6 +435,8 @@ unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); +int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, + struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); #endif /* CONFIG_DAMON_PADDR */ -- cgit v1.2.3 From ee801b7dd7822a82fd7663048ad649545fac6df3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 5 Nov 2021 13:47:47 -0700 Subject: mm/damon/schemes: activate schemes based on a watermarks mechanism DAMON-based operation schemes need to be manually turned on and off. In some use cases, however, the condition for turning a scheme on and off would depend on the system's situation. For example, schemes for proactive page reclamation would need to be turned on when some memory pressure is detected, and turned off when the system has enough free memory. For easier control of scheme activation based on the system situation, this introduces a watermarks-based mechanism.
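As a rough sketch of the interface this introduces (struct and enum from the hunk below; all numbers are illustrative placeholders), a proactive-reclaim style scheme could be told to run only while free memory is scarce:

  /* Activate between the low and mid free-memory-rate watermarks;
   * deactivate when memory is plentiful (above high) or so scarce that
   * the traditional reclaim mechanisms should take over (below low). */
  struct damos_watermarks wmarks = {
          .metric = DAMOS_WMARK_FREE_MEM_RATE,    /* per-thousand, [0, 1000] */
          .interval = 5 * 1000 * 1000,            /* check every 5s (us) */
          .high = 500,    /* >50.0% free memory: deactivate */
          .mid = 400,     /* <40.0% free memory: activate */
          .low = 50,      /* <5.0% free memory: deactivate */
  };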
The client can describe the watermark metric (e.g., amount of free memory in the system), watermark check interval, and three watermarks, namely high, mid, and low. If the scheme is deactivated, it only gets the metric and compares it to the three watermarks every check interval. If the metric is higher than the high watermark, the scheme is deactivated. If the metric is between the mid watermark and the low watermark, the scheme is activated. If the metric is lower than the low watermark, the scheme is deactivated again. This is to allow users to fall back to traditional page-granularity mechanisms. Link: https://lkml.kernel.org/r/20211019150731.16699-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Amit Shah Cc: Benjamin Herrenschmidt Cc: David Hildenbrand Cc: David Rientjes Cc: David Woodhouse Cc: Greg Thelen Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Leonard Foerster Cc: Marco Elver Cc: Markus Boehme Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1217566a0ebc..c93325efddd7 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -146,6 +146,45 @@ struct damos_quota { unsigned int min_score; }; +/** + * enum damos_wmark_metric - Represents the watermark metric. + * + * @DAMOS_WMARK_NONE: Ignore the watermarks of the given scheme. + * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000]. + */ +enum damos_wmark_metric { + DAMOS_WMARK_NONE, + DAMOS_WMARK_FREE_MEM_RATE, +}; + +/** + * struct damos_watermarks - Controls when a given scheme should be activated. + * @metric: Metric for the watermarks. + * @interval: Watermarks check time interval in microseconds. + * @high: High watermark. + * @mid: Middle watermark. + * @low: Low watermark. + * + * If &metric is &DAMOS_WMARK_NONE, the scheme is always active. Being active + * means DAMON does monitoring and applying the action of the scheme to + * appropriate memory regions. Else, DAMON checks &metric of the system for at + * least every &interval microseconds and works as below. + * + * If &metric is higher than &high, the scheme is inactivated. If &metric is + * between &mid and &low, the scheme is activated. If &metric is lower than + * &low, the scheme is inactivated. + */ +struct damos_watermarks { + enum damos_wmark_metric metric; + unsigned long interval; + unsigned long high; + unsigned long mid; + unsigned long low; + +/* private: */ + bool activated; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -156,6 +195,7 @@ struct damos_quota { * @max_age_region: Maximum age of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. + * @wmarks: Watermarks for automated (in)activation of this scheme. * @stat_count: Total number of regions that this scheme is applied. * @stat_sz: Total size of regions that this scheme is applied. * @list: List head for siblings. @@ -166,6 +206,14 @@ struct damos_quota { * those. To avoid consuming too much CPU time or IO resources for the * &action, &quota is used. * + * To do the work only when needed, schemes can be activated for specific + * system situations using &wmarks.
If all schemes that registered to the + * monitoring context are inactive, DAMON stops monitoring either, and just + * repeatedly checks the watermarks. + * + * If all schemes that registered to a &struct damon_ctx are inactive, DAMON + * stops monitoring and just repeatedly checks the watermarks. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -179,6 +227,7 @@ struct damos { unsigned int max_age_region; enum damos_action action; struct damos_quota quota; + struct damos_watermarks wmarks; unsigned long stat_count; unsigned long stat_sz; struct list_head list; @@ -384,7 +433,8 @@ struct damos *damon_new_scheme( unsigned long min_sz_region, unsigned long max_sz_region, unsigned int min_nr_accesses, unsigned int max_nr_accesses, unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota); + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -- cgit v1.2.3 From b5ca3e83ddb05342b1b30700b999cb9b107511f6 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 5 Nov 2021 13:48:07 -0700 Subject: mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on When the ctx->adaptive_targets list is empty, I did some tests on the monitor_on interface like this. # cat /sys/kernel/debug/damon/target_ids # # echo on > /sys/kernel/debug/damon/monitor_on # damon: kdamond (5390) starts Though the ctx->adaptive_targets list is empty, kthread_run() is still called and the kdamond.x thread is still created, which is meaningless. So add a check in 'dbgfs_monitor_on_write': if the ctx->adaptive_targets list is empty, return -EINVAL. Link: https://lkml.kernel.org/r/0a60a6e8ec9d71989e0848a4dc3311996ca3b5d4.1634720326.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index c93325efddd7..fa7f32614b65 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -440,6 +440,7 @@ void damon_destroy_scheme(struct damos *s); struct damon_target *damon_new_target(unsigned long id); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); +bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); void damon_destroy_target(struct damon_target *t); unsigned int damon_nr_regions(struct damon_target *t); -- cgit v1.2.3 From 0f91d13366a402420bf98eaaf393db03946c13e0 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:22 -0700 Subject: mm/damon: simplify stop mechanism A kernel thread can exit gracefully with kthread_stop(). So we don't need a new flag 'kdamond_stop'. And to make sure the task struct is not freed when accessing it, get a reference to it before termination.
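The pattern the message describes, sketched against the fields that remain in struct damon_ctx (the helper name is illustrative; the actual change lives in mm/damon/core.c, which this include/linux hunk does not show):

  static int example_stop_kdamond(struct damon_ctx *ctx)
  {
          struct task_struct *tsk;

          mutex_lock(&ctx->kdamond_lock);
          tsk = ctx->kdamond;
          if (!tsk) {
                  mutex_unlock(&ctx->kdamond_lock);
                  return -EPERM;
          }
          get_task_struct(tsk);           /* keep the task_struct alive... */
          mutex_unlock(&ctx->kdamond_lock);
          kthread_stop(tsk);              /* ...while waiting for kdamond to exit */
          put_task_struct(tsk);
          return 0;
  }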
Link: https://lkml.kernel.org/r/20211027130517.4404-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index fa7f32614b65..321de9d72360 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -381,7 +381,6 @@ struct damon_ctx { /* public: */ struct task_struct *kdamond; - bool kdamond_stop; struct mutex kdamond_lock; struct damon_primitive primitive; -- cgit v1.2.3 From 658f9ae761b5965893727dd4edcdad56e5a439bb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 5 Nov 2021 13:48:27 -0700 Subject: mm/damon: remove return value from before_terminate callback Since the return value of the 'before_terminate' callback is never used, we make it have no return value. Link: https://lkml.kernel.org/r/20211029005023.8895-1-changbin.du@gmail.com Signed-off-by: Changbin Du Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 321de9d72360..b4d4be3cc987 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -322,7 +322,7 @@ struct damon_callback { int (*before_start)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); - int (*before_terminate)(struct damon_ctx *context); + void (*before_terminate)(struct damon_ctx *context); }; /** -- cgit v1.2.3 From e1c9788cb39777e81ebfbf31ae80b4ec14eb6f6d Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Mon, 27 Sep 2021 19:22:27 +0530 Subject: ceph: don't rely on error_string to validate blocklisted session. The "error_string" in the metadata of MClientSession is being parsed by kclient to validate whether the session is blocklisted. The "error_string" is for humans and shouldn't be relied on. Hence a flag is added to MClientSession to indicate that the session is blocklisted. [ jlayton: minor formatting cleanup ] URL: https://tracker.ceph.com/issues/47450 Signed-off-by: Kotresh HR Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bc2699feddbe..7ad6c3d0db7d 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -302,6 +302,8 @@ enum { CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; +#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */ + extern const char *ceph_session_op_name(int op); struct ceph_mds_session_head { -- cgit v1.2.3 From aca39d9e86f3edeaac5d2c467f5fd31e0b0df606 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Thu, 4 Nov 2021 12:31:46 +0000 Subject: libceph, ceph: move ceph_osdc_copy_from() into cephfs code This patch moves the ceph_osdc_copy_from() function out of libceph into cephfs. There are no other users of this function, and there is a need (in another patch) to access internal ceph_osd_request struct members.
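For context, a rough sketch of what the cephfs-side caller might look like once the helper below is exported (heavily abridged; the osdc and src_*/dst_* variables are assumed from the copy_file_range path, and error handling is trimmed):

  struct ceph_osd_request *req;
  int ret;

  req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
  if (IS_ERR(req))
          return PTR_ERR(req);

  req->r_flags = CEPH_OSD_FLAG_WRITE;
  ceph_oid_copy(&req->r_base_oid, dst_oid);       /* destination now lives on */
  ceph_oloc_copy(&req->r_base_oloc, dst_oloc);    /* the request itself */

  ret = osd_req_op_copy_from_init(req, src_snapid, src_version,
                                  src_oid, src_oloc,
                                  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL,
                                  CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
                                  truncate_seq, truncate_size,
                                  CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);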
Signed-off-by: Luís Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 83fa08a06507..3431011f364d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -475,6 +475,14 @@ extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_object_size, u64 expected_write_size, u32 flags); +extern int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -515,17 +523,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct page *req_page, size_t req_len, struct page **resp_pages, size_t *resp_len); -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags); - /* watch/notify */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, -- cgit v1.2.3 From c6975d7cab5b903aadbc0f78f9af4fae1bd23a50 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 5 Nov 2021 11:05:09 -0400 Subject: arm64: Track no early_pgtable_alloc() for kmemleak After switching the page size from 64KB to 4KB on several arm64 servers here, kmemleak starts to run out of the early memory pool due to a huge number of early_pgtable_alloc() calls: kmemleak_alloc_phys() memblock_alloc_range_nid() memblock_phys_alloc_range() early_pgtable_alloc() init_pmd() alloc_init_pud() __create_pgd_mapping() __map_memblock() paging_init() setup_arch() start_kernel() Increasing the default value of DEBUG_KMEMLEAK_MEM_POOL_SIZE by 4 times won't be enough for a server with 200GB+ memory. There isn't much interest in checking memory leaks for those early page tables, and those early memory mappings should not reference other memory. Hence, no kmemleak false positives, and we can safely skip tracking those early allocations from kmemleak like we did in the commit fed84c785270 ("mm/memblock.c: skip kmemleak for kasan_init()") without needing to introduce complications to automatically scale the value depending on the runtime memory size, etc. After the patch, the default value of DEBUG_KMEMLEAK_MEM_POOL_SIZE becomes sufficient again.
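The arm64 call site the message refers to, abridged from arch/arm64/mm/mmu.c as of this series (the clearing and fixmap handling of the real function are omitted):

  static phys_addr_t __init early_pgtable_alloc(int shift)
  {
          phys_addr_t phys;

          /* MEMBLOCK_ALLOC_NOLEAKTRACE, passed as the allocation's upper
           * bound sentinel, tells memblock not to register the range with
           * kmemleak, so the early pool is no longer exhausted. */
          phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
                                           MEMBLOCK_ALLOC_NOLEAKTRACE);
          if (!phys)
                  panic("Failed to allocate page table page\n");

          return phys;
  }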
Signed-off-by: Qian Cai Reviewed-by: Catalin Marinas Reviewed-by: Mike Rapoport Link: https://lore.kernel.org/r/20211105150509.7826-1-quic_qiancai@quicinc.com Signed-off-by: Will Deacon --- include/linux/memblock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 34de69b3b8ba..efc896155dae 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -372,7 +372,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 -#define MEMBLOCK_ALLOC_KASAN 1 +#define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ #define MEMBLOCK_LOW_LIMIT 0 -- cgit v1.2.3 From dfd5bb23ad75bdabde89ac3166705a450bf16acb Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 5 May 2021 14:00:06 +0200 Subject: PCI: Export pci_dev_lock() Commit e3a9b1212b9d ("PCI: Export pci_dev_trylock() and pci_dev_unlock()") already exported pci_dev_trylock()/pci_dev_unlock(); however, in some circumstances, such as during error recovery, it makes sense to block waiting to get full access to the device, so also export pci_dev_lock(). Link: https://lore.kernel.org/all/20210928181014.GA713179@bhelgaas/ Acked-by: Pierre Morel Acked-by: Bjorn Helgaas Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index b4dbcc86b3f1..d307b071b65e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1664,6 +1664,7 @@ void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); +void pci_dev_lock(struct pci_dev *dev); int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); -- cgit v1.2.3 From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Nov 2021 13:47:32 -0700 Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign In order to fix an issue with sockets in TCP sockmap redirect cases, we plan to allow CLOSE state sockets to exist in the sockmap. However, the check in bpf_sk_lookup_assign() currently only invalidates sockets in the TCP_ESTABLISHED case, relying on the checks at sockmap insert to ensure we never have SOCK_CLOSE state sockets in the map. To prepare for this change, we flip the logic in bpf_sk_lookup_assign() to explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN or a udp socket in TCP_CLOSE state. This also makes the code more resilient to future changes.
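The flipped check in bpf_sk_lookup_assign() then reads roughly as below (the net/core/filter.c side of this series, not shown in the include/linux hunk that follows; abridged):

  /* Only accept the cases we can support: a TCP socket in TCP_LISTEN,
   * or a UDP socket in TCP_CLOSE; reject everything else explicitly. */
  if (sk && sk_is_refcounted(sk))
          return -ESOCKTNOSUPPORT;        /* only RCU-freed sockets */
  if (sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)
          return -ESOCKTNOSUPPORT;
  if (sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)
          return -ESOCKTNOSUPPORT;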
Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com --- include/linux/skmsg.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index b4256847c707..584d94be9c8b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -507,6 +507,18 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +static inline bool sk_is_tcp(const struct sock *sk) +{ + return sk->sk_type == SOCK_STREAM && + sk->sk_protocol == IPPROTO_TCP; +} + +static inline bool sk_is_udp(const struct sock *sk) +{ + return sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) -- cgit v1.2.3 From cf30f6a5f0c60ec98a637b836bef6915f602c6ab Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 11 Sep 2020 16:49:00 -0700 Subject: lib: zstd: Add kernel-specific API This patch: - Moves `include/linux/zstd.h` -> `include/linux/zstd_lib.h` - Updates modified zstd headers to yearless copyright - Adds a new API in `include/linux/zstd.h` that is functionally equivalent to the in-use subset of the current API. Functions are renamed to avoid symbol collisions with zstd, to make it clear it is not the upstream zstd API, and to follow the kernel style guide. - Updates all callers to use the new API. There are no functional changes in this patch. Since there are no functional changes, I felt it was okay to update all the callers in a single patch. Once the API is approved, the callers are mechanically changed. This patch is preparing for the 3rd patch in this series, which updates zstd to version 1.4.10. Since the upstream zstd API is no longer exposed to callers, the update can happen transparently. Signed-off-by: Nick Terrell Tested-by: Paul Jones Tested-by: Oleksandr Natalenko Tested-by: Sedat Dilek # LLVM/Clang v13.0.0 on x86-64 Tested-by: Jean-Denis Girard --- include/linux/zstd.h | 1243 ++++++++++------------------------------ include/linux/zstd_lib.h | 1157 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1420 insertions(+), 980 deletions(-) create mode 100644 include/linux/zstd_lib.h (limited to 'include/linux') diff --git a/include/linux/zstd.h b/include/linux/zstd.h index e87f78c9b19c..9fbc7729b0a0 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,138 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of https://github.com/facebook/zstd. - * An additional grant of patent rights can be found in the PATENTS file in the - * same directory. - * - * This program is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License version 2 as published by the - * Free Software Foundation. This program is dual-licensed; you may select - * either version 2 of the GNU General Public License ("GPL") or BSD license - * ("BSD").
+ * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd) and + * the GPLv2 (found in the COPYING file in the root directory of + * https://github.com/facebook/zstd). You may select, at your option, one of the + * above-listed licenses. */ -#ifndef ZSTD_H -#define ZSTD_H +#ifndef LINUX_ZSTD_H +#define LINUX_ZSTD_H -/* ====== Dependency ======*/ -#include /* size_t */ +/** + * This is a kernel-style API that wraps the upstream zstd API, which cannot be + * used directly because the symbols aren't exported. It exposes the minimal + * functionality which is currently required by users of zstd in the kernel. + * Expose extra functions from lib/zstd/zstd.h as needed. + */ +/* ====== Dependency ====== */ +#include +#include -/*-***************************************************************************** - * Introduction +/* ====== Helper Functions ====== */ +/** + * zstd_compress_bound() - maximum compressed size in worst case scenario + * @src_size: The size of the data to compress. * - * zstd, short for Zstandard, is a fast lossless compression algorithm, - * targeting real-time compression scenarios at zlib-level and better - * compression ratios. The zstd compression library provides in-memory - * compression and decompression functions. The library supports compression - * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled - * ultra, should be used with caution, as they require more memory. - * Compression can be done in: - * - a single step, reusing a context (described as Explicit memory management) - * - unbounded multiple steps (described as Streaming compression) - * The compression ratio achievable on small data can be highly improved using - * compression with a dictionary in: - * - a single step (described as Simple dictionary API) - * - a single step, reusing a dictionary (described as Fast dictionary API) - ******************************************************************************/ - -/*====== Helper functions ======*/ + * Return: The maximum compressed size in the worst case scenario. + */ +size_t zstd_compress_bound(size_t src_size); /** - * enum ZSTD_ErrorCode - zstd error codes + * zstd_is_error() - tells if a size_t function result is an error code + * @code: The function result to check for error. * - * Functions that return size_t can be checked for errors using ZSTD_isError() - * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode(). + * Return: Non-zero iff the code is an error. 
+ */ +unsigned int zstd_is_error(size_t code); + +/** + * enum zstd_error_code - zstd error codes */ -typedef enum { - ZSTD_error_no_error, - ZSTD_error_GENERIC, - ZSTD_error_prefix_unknown, - ZSTD_error_version_unsupported, - ZSTD_error_parameter_unknown, - ZSTD_error_frameParameter_unsupported, - ZSTD_error_frameParameter_unsupportedBy32bits, - ZSTD_error_frameParameter_windowTooLarge, - ZSTD_error_compressionParameter_unsupported, - ZSTD_error_init_missing, - ZSTD_error_memory_allocation, - ZSTD_error_stage_wrong, - ZSTD_error_dstSize_tooSmall, - ZSTD_error_srcSize_wrong, - ZSTD_error_corruption_detected, - ZSTD_error_checksum_wrong, - ZSTD_error_tableLog_tooLarge, - ZSTD_error_maxSymbolValue_tooLarge, - ZSTD_error_maxSymbolValue_tooSmall, - ZSTD_error_dictionary_corrupted, - ZSTD_error_dictionary_wrong, - ZSTD_error_dictionaryCreation_failed, - ZSTD_error_maxCode -} ZSTD_ErrorCode; +typedef ZSTD_ErrorCode zstd_error_code; /** - * ZSTD_maxCLevel() - maximum compression level available + * zstd_get_error_code() - translates an error function result to an error code + * @code: The function result for which zstd_is_error(code) is true. * - * Return: Maximum compression level available. + * Return: A unique error code for this error. */ -int ZSTD_maxCLevel(void); +zstd_error_code zstd_get_error_code(size_t code); + /** - * ZSTD_compressBound() - maximum compressed size in worst case scenario - * @srcSize: The size of the data to compress. + * zstd_get_error_name() - translates an error function result to a string + * @code: The function result for which zstd_is_error(code) is true. * - * Return: The maximum compressed size in the worst case scenario. + * Return: An error string corresponding to the error code. */ -size_t ZSTD_compressBound(size_t srcSize); +const char *zstd_get_error_name(size_t code); + /** - * ZSTD_isError() - tells if a size_t function result is an error code - * @code: The function result to check for error. + * zstd_min_clevel() - minimum allowed compression level * - * Return: Non-zero iff the code is an error. + * Return: The minimum allowed compression level. */ -static __attribute__((unused)) unsigned int ZSTD_isError(size_t code) -{ - return code > (size_t)-ZSTD_error_maxCode; -} +int zstd_min_clevel(void); + /** - * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode - * @functionResult: The result of a function for which ZSTD_isError() is true. + * zstd_max_clevel() - maximum allowed compression level * - * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0 - * if the functionResult isn't an error. + * Return: The maximum allowed compression level. */ -static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode( - size_t functionResult) -{ - if (!ZSTD_isError(functionResult)) - return (ZSTD_ErrorCode)0; - return (ZSTD_ErrorCode)(0 - functionResult); -} +int zstd_max_clevel(void); + +/* ====== Parameter Selection ====== */ /** - * enum ZSTD_strategy - zstd compression search strategy + * enum zstd_strategy - zstd compression search strategy * - * From faster to stronger. + * From faster to stronger. See zstd_lib.h. */ -typedef enum { - ZSTD_fast, - ZSTD_dfast, - ZSTD_greedy, - ZSTD_lazy, - ZSTD_lazy2, - ZSTD_btlazy2, - ZSTD_btopt, - ZSTD_btopt2 -} ZSTD_strategy; +typedef ZSTD_strategy zstd_strategy; /** - * struct ZSTD_compressionParameters - zstd compression parameters + * struct zstd_compression_parameters - zstd compression parameters * @windowLog: Log of the largest match distance. 
Larger means more * compression, and more memory needed during decompression. - * @chainLog: Fully searched segment. Larger means more compression, slower, - * and more memory (useless for fast). + * @chainLog: Fully searched segment. Larger means more compression, + * slower, and more memory (useless for fast). * @hashLog: Dispatch table. Larger means more compression, * slower, and more memory. * @searchLog: Number of searches. Larger means more compression and slower. @@ -141,1017 +99,342 @@ typedef enum { * @targetLength: Acceptable match size for optimal parser (only). Larger means * more compression, and slower. * @strategy: The zstd compression strategy. + * + * See zstd_lib.h. */ -typedef struct { - unsigned int windowLog; - unsigned int chainLog; - unsigned int hashLog; - unsigned int searchLog; - unsigned int searchLength; - unsigned int targetLength; - ZSTD_strategy strategy; -} ZSTD_compressionParameters; +typedef ZSTD_compressionParameters zstd_compression_parameters; /** - * struct ZSTD_frameParameters - zstd frame parameters - * @contentSizeFlag: Controls whether content size will be present in the frame - * header (when known). - * @checksumFlag: Controls whether a 32-bit checksum is generated at the end - * of the frame for error detection. - * @noDictIDFlag: Controls whether dictID will be saved into the frame header - * when using dictionary compression. + * struct zstd_frame_parameters - zstd frame parameters + * @contentSizeFlag: Controls whether content size will be present in the + * frame header (when known). + * @checksumFlag: Controls whether a 32-bit checksum is generated at the + * end of the frame for error detection. + * @noDictIDFlag: Controls whether dictID will be saved into the frame + * header when using dictionary compression. * - * The default value is all fields set to 0. + * The default value is all fields set to 0. See zstd_lib.h. */ -typedef struct { - unsigned int contentSizeFlag; - unsigned int checksumFlag; - unsigned int noDictIDFlag; -} ZSTD_frameParameters; +typedef ZSTD_frameParameters zstd_frame_parameters; /** - * struct ZSTD_parameters - zstd parameters + * struct zstd_parameters - zstd parameters * @cParams: The compression parameters. * @fParams: The frame parameters. */ -typedef struct { - ZSTD_compressionParameters cParams; - ZSTD_frameParameters fParams; -} ZSTD_parameters; +typedef ZSTD_parameters zstd_parameters; /** - * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level - * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). - * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. - * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * zstd_get_params() - returns zstd_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. * - * Return: The selected ZSTD_compressionParameters. + * Return: The selected zstd_parameters. */ -ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, - unsigned long long estimatedSrcSize, size_t dictSize); +zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); -/** - * ZSTD_getParams() - returns ZSTD_parameters for selected level - * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). - * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. - * @dictSize: The dictionary size or 0 if a dictionary isn't being used. 
- * - * The same as ZSTD_getCParams() except also selects the default frame - * parameters (all zero). - * - * Return: The selected ZSTD_parameters. - */ -ZSTD_parameters ZSTD_getParams(int compressionLevel, - unsigned long long estimatedSrcSize, size_t dictSize); +/* ====== Single-pass Compression ====== */ -/*-************************************* - * Explicit memory management - **************************************/ +typedef ZSTD_CCtx zstd_cctx; /** - * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx - * @cParams: The compression parameters to be used for compression. + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx + * @parameters: The compression parameters to be used. * * If multiple compression parameters might be used, the caller must call - * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum + * zstd_cctx_workspace_bound() for each set of parameters and use the maximum * size. * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initCCtx(). + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_cctx(). */ -size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams); +size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); /** - * struct ZSTD_CCtx - the zstd compression context - * - * When compressing many times it is recommended to allocate a context just once - * and reuse it for each successive compression operation. - */ -typedef struct ZSTD_CCtx_s ZSTD_CCtx; -/** - * ZSTD_initCCtx() - initialize a zstd compression context - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to - * determine how large the workspace must be. - * - * Return: A compression context emplaced into workspace. - */ -ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize); - -/** - * ZSTD_compressCCtx() - compress src into dst - * @ctx: The context. Must have been initialized with a workspace at - * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). - * @dst: The buffer to compress src into. - * @dstCapacity: The size of the destination buffer. May be any size, but - * ZSTD_compressBound(srcSize) is guaranteed to be large enough. - * @src: The data to compress. - * @srcSize: The size of the data to compress. - * @params: The parameters to use for compression. See ZSTD_getParams(). - * - * Return: The compressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, ZSTD_parameters params); - -/** - * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx - * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initDCtx(). - */ -size_t ZSTD_DCtxWorkspaceBound(void); - -/** - * struct ZSTD_DCtx - the zstd decompression context - * - * When decompressing many times it is recommended to allocate a context just - * once and reuse it for each successive decompression operation. - */ -typedef struct ZSTD_DCtx_s ZSTD_DCtx; -/** - * ZSTD_initDCtx() - initialize a zstd decompression context - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to - * determine how large the workspace must be. 
- * - * Return: A decompression context emplaced into workspace. - */ -ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize); - -/** - * ZSTD_decompressDCtx() - decompress zstd compressed src into dst - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); - -/*-************************ - * Simple dictionary API - **************************/ - -/** - * ZSTD_compress_usingDict() - compress src into dst using a dictionary - * @ctx: The context. Must have been initialized with a workspace at - * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). - * @dst: The buffer to compress src into. - * @dstCapacity: The size of the destination buffer. May be any size, but - * ZSTD_compressBound(srcSize) is guaranteed to be large enough. - * @src: The data to compress. - * @srcSize: The size of the data to compress. - * @dict: The dictionary to use for compression. - * @dictSize: The size of the dictionary. - * @params: The parameters to use for compression. See ZSTD_getParams(). - * - * Compression using a predefined dictionary. The same dictionary must be used - * during decompression. - * - * Return: The compressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, const void *dict, size_t dictSize, - ZSTD_parameters params); - -/** - * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * @dict: The dictionary to use for decompression. The same dictionary - * must've been used to compress the data. - * @dictSize: The size of the dictionary. - * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, const void *dict, size_t dictSize); - -/*-************************** - * Fast dictionary API - ***************************/ - -/** - * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict - * @cParams: The compression parameters to be used for compression. + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. Use zstd_cctx_workspace_bound() to + * determine how large the workspace must be. 
* - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initCDict(). - */ -size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams); - -/** - * struct ZSTD_CDict - a digested dictionary to be used for compression + * Return: A zstd compression context or NULL on error. */ -typedef struct ZSTD_CDict_s ZSTD_CDict; +zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); /** - * ZSTD_initCDict() - initialize a digested dictionary for compression - * @dictBuffer: The dictionary to digest. The buffer is referenced by the - * ZSTD_CDict so it must outlive the returned ZSTD_CDict. - * @dictSize: The size of the dictionary. - * @params: The parameters to use for compression. See ZSTD_getParams(). - * @workspace: The workspace. It must outlive the returned ZSTD_CDict. - * @workspaceSize: The workspace size. Must be at least - * ZSTD_CDictWorkspaceBound(params.cParams). + * zstd_compress_cctx() - compress src into dst with the initialized parameters + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @parameters: The compression parameters to be used. * - * When compressing multiple messages / blocks with the same dictionary it is - * recommended to load it just once. The ZSTD_CDict merely references the - * dictBuffer, so it must outlive the returned ZSTD_CDict. - * - * Return: The digested dictionary emplaced into workspace. + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). */ -ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize, - ZSTD_parameters params, void *workspace, size_t workspaceSize); +size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size, const zstd_parameters *parameters); -/** - * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict - * @ctx: The context. Must have been initialized with a workspace at - * least as large as ZSTD_CCtxWorkspaceBound(cParams) where - * cParams are the compression parameters used to initialize the - * cdict. - * @dst: The buffer to compress src into. - * @dstCapacity: The size of the destination buffer. May be any size, but - * ZSTD_compressBound(srcSize) is guaranteed to be large enough. - * @src: The data to compress. - * @srcSize: The size of the data to compress. - * @cdict: The digested dictionary to use for compression. - * @params: The parameters to use for compression. See ZSTD_getParams(). - * - * Compression using a digested dictionary. The same dictionary must be used - * during decompression. - * - * Return: The compressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, const ZSTD_CDict *cdict); +/* ====== Single-pass Decompression ====== */ +typedef ZSTD_DCtx zstd_dctx; /** - * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict + * zstd_dctx_workspace_bound() - max memory needed to initialize a zstd_dctx * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initDDict(). 
- */ -size_t ZSTD_DDictWorkspaceBound(void); - -/** - * struct ZSTD_DDict - a digested dictionary to be used for decompression + * Return: A lower bound on the size of the workspace that is passed to + * zstd_init_dctx(). */ -typedef struct ZSTD_DDict_s ZSTD_DDict; +size_t zstd_dctx_workspace_bound(void); /** - * ZSTD_initDDict() - initialize a digested dictionary for decompression - * @dictBuffer: The dictionary to digest. The buffer is referenced by the - * ZSTD_DDict so it must outlive the returned ZSTD_DDict. - * @dictSize: The size of the dictionary. - * @workspace: The workspace. It must outlive the returned ZSTD_DDict. - * @workspaceSize: The workspace size. Must be at least - * ZSTD_DDictWorkspaceBound(). - * - * When decompressing multiple messages / blocks with the same dictionary it is - * recommended to load it just once. The ZSTD_DDict merely references the - * dictBuffer, so it must outlive the returned ZSTD_DDict. + * zstd_init_dctx() - initialize a zstd decompression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. Use zstd_dctx_workspace_bound() to + * determine how large the workspace must be. * - * Return: The digested dictionary emplaced into workspace. + * Return: A zstd decompression context or NULL on error. */ -ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize, - void *workspace, size_t workspaceSize); +zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); /** - * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * @ddict: The digested dictionary to use for decompression. The same - * dictionary must've been used to compress the data. + * zstd_decompress_dctx() - decompress zstd compressed src into dst + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). */ -size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, - size_t dstCapacity, const void *src, size_t srcSize, - const ZSTD_DDict *ddict); +size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, + const void *src, size_t src_size); - -/*-************************** - * Streaming - ***************************/ +/* ====== Streaming Buffers ====== */ /** - * struct ZSTD_inBuffer - input buffer for streaming + * struct zstd_in_buffer - input buffer for streaming * @src: Start of the input buffer. * @size: Size of the input buffer. * @pos: Position where reading stopped. 
Will be updated. * Necessarily 0 <= pos <= size. + * + * See zstd_lib.h. */ -typedef struct ZSTD_inBuffer_s { - const void *src; - size_t size; - size_t pos; -} ZSTD_inBuffer; +typedef ZSTD_inBuffer zstd_in_buffer; /** - * struct ZSTD_outBuffer - output buffer for streaming + * struct zstd_out_buffer - output buffer for streaming * @dst: Start of the output buffer. * @size: Size of the output buffer. * @pos: Position where writing stopped. Will be updated. * Necessarily 0 <= pos <= size. + * + * See zstd_lib.h. */ -typedef struct ZSTD_outBuffer_s { - void *dst; - size_t size; - size_t pos; -} ZSTD_outBuffer; +typedef ZSTD_outBuffer zstd_out_buffer; +/* ====== Streaming Compression ====== */ - -/*-***************************************************************************** - * Streaming compression - HowTo - * - * A ZSTD_CStream object is required to track streaming operation. - * Use ZSTD_initCStream() to initialize a ZSTD_CStream object. - * ZSTD_CStream objects can be reused multiple times on consecutive compression - * operations. It is recommended to re-use ZSTD_CStream in situations where many - * streaming operations will be achieved consecutively. Use one separate - * ZSTD_CStream per thread for parallel execution. - * - * Use ZSTD_compressStream() repetitively to consume input stream. - * The function will automatically update both `pos` fields. - * Note that it may not consume the entire input, in which case `pos < size`, - * and it's up to the caller to present again remaining data. - * It returns a hint for the preferred number of bytes to use as an input for - * the next function call. - * - * At any moment, it's possible to flush whatever data remains within internal - * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might - * still be some content left within the internal buffer if `output->size` is - * too small. It returns the number of bytes left in the internal buffer and - * must be called until it returns 0. - * - * ZSTD_endStream() instructs to finish a frame. It will perform a flush and - * write frame epilogue. The epilogue is required for decoders to consider a - * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush - * the full content if `output->size` is too small. In which case, call again - * ZSTD_endStream() to complete the flush. It returns the number of bytes left - * in the internal buffer and must be called until it returns 0. - ******************************************************************************/ +typedef ZSTD_CStream zstd_cstream; /** - * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream - * @cParams: The compression parameters to be used for compression. + * zstd_cstream_workspace_bound() - memory needed to initialize a zstd_cstream + * @cparams: The compression parameters to be used for compression. * * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initCStream() and ZSTD_initCStream_usingCDict(). - */ -size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams); - -/** - * struct ZSTD_CStream - the zstd streaming compression context + * zstd_init_cstream(). */ -typedef struct ZSTD_CStream_s ZSTD_CStream; +size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); -/*===== ZSTD_CStream management functions =====*/ /** - * ZSTD_initCStream() - initialize a zstd streaming compression context - * @params: The zstd compression parameters. 
- * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must - * pass the source size (zero means empty source). Otherwise, - * the caller may optionally pass the source size, or zero if - * unknown. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine - * how large the workspace must be. + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. + * @pledged_src_size: If params.fParams.contentSizeFlag == 1 then the caller + * must pass the source size (zero means empty source). + * Otherwise, the caller may optionally pass the source + * size, or zero if unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspace_size: The size of workspace. + * Use zstd_cstream_workspace_bound(params->cparams) to + * determine how large the workspace must be. * - * Return: The zstd streaming compression context. + * Return: The zstd streaming compression context or NULL on error. */ -ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, - unsigned long long pledgedSrcSize, void *workspace, - size_t workspaceSize); +zstd_cstream *zstd_init_cstream(const zstd_parameters *parameters, + unsigned long long pledged_src_size, void *workspace, size_t workspace_size); /** - * ZSTD_initCStream_usingCDict() - initialize a streaming compression context - * @cdict: The digested dictionary to use for compression. - * @pledgedSrcSize: Optionally the source size, or zero if unknown. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound() - * with the cParams used to initialize the cdict to determine - * how large the workspace must be. - * - * Return: The zstd streaming compression context. - */ -ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, - unsigned long long pledgedSrcSize, void *workspace, - size_t workspaceSize); - -/*===== Streaming compression functions =====*/ -/** - * ZSTD_resetCStream() - reset the context using parameters from creation - * @zcs: The zstd streaming compression context to reset. - * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * zstd_reset_cstream() - reset the context using parameters from creation + * @cstream: The zstd streaming compression context to reset. + * @pledged_src_size: Optionally the source size, or zero if unknown. * * Resets the context using the parameters from creation. Skips dictionary - * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame + * loading, since it can be reused. If `pledged_src_size` is non-zero the frame * content size is always written into the frame header. * - * Return: Zero or an error, which can be checked using ZSTD_isError(). + * Return: Zero or an error, which can be checked using + * zstd_is_error(). */ -size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize); +size_t zstd_reset_cstream(zstd_cstream *cstream, + unsigned long long pledged_src_size); + /** - * ZSTD_compressStream() - streaming compress some of input into output - * @zcs: The zstd streaming compression context. - * @output: Destination buffer. `output->pos` is updated to indicate how much - * compressed data was written. - * @input: Source buffer. 
`input->pos` is updated to indicate how much data was - * read. Note that it may not consume the entire input, in which case - * `input->pos < input->size`, and it's up to the caller to present - * remaining data again. + * zstd_compress_stream() - streaming compress some of input into output + * @cstream: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data + * was read. Note that it may not consume the entire input, in which + * case `input->pos < input->size`, and it's up to the caller to + * present remaining data again. * * The `input` and `output` buffers may be any size. Guaranteed to make some * forward progress if `input` and `output` are not empty. * - * Return: A hint for the number of bytes to use as the input for the next - * function call or an error, which can be checked using - * ZSTD_isError(). + * Return: A hint for the number of bytes to use as the input for the next + * function call or an error, which can be checked using + * zstd_is_error(). */ -size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, - ZSTD_inBuffer *input); +size_t zstd_compress_stream(zstd_cstream *cstream, zstd_out_buffer *output, + zstd_in_buffer *input); + /** - * ZSTD_flushStream() - flush internal buffers into output - * @zcs: The zstd streaming compression context. - * @output: Destination buffer. `output->pos` is updated to indicate how much - * compressed data was written. + * zstd_flush_stream() - flush internal buffers into output + * @cstream: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. * - * ZSTD_flushStream() must be called until it returns 0, meaning all the data - * has been flushed. Since ZSTD_flushStream() causes a block to be ended, + * zstd_flush_stream() must be called until it returns 0, meaning all the data + * has been flushed. Since zstd_flush_stream() causes a block to be ended, * calling it too often will degrade the compression ratio. * - * Return: The number of bytes still present within internal buffers or an - * error, which can be checked using ZSTD_isError(). - */ -size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); -/** - * ZSTD_endStream() - flush internal buffers into output and end the frame - * @zcs: The zstd streaming compression context. - * @output: Destination buffer. `output->pos` is updated to indicate how much - * compressed data was written. - * - * ZSTD_endStream() must be called until it returns 0, meaning all the data has - * been flushed and the frame epilogue has been written. - * - * Return: The number of bytes still present within internal buffers or an - * error, which can be checked using ZSTD_isError(). + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using zstd_is_error(). */ -size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); +size_t zstd_flush_stream(zstd_cstream *cstream, zstd_out_buffer *output); /** - * ZSTD_CStreamInSize() - recommended size for the input buffer - * - * Return: The recommended size for the input buffer. - */ -size_t ZSTD_CStreamInSize(void); -/** - * ZSTD_CStreamOutSize() - recommended size for the output buffer + * zstd_end_stream() - flush internal buffers into output and end the frame + * @cstream: The zstd streaming compression context. 
+ * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. * - * When the output buffer is at least this large, it is guaranteed to be large - * enough to flush at least one complete compressed block. + * zstd_end_stream() must be called until it returns 0, meaning all the data has + * been flushed and the frame epilogue has been written. * - * Return: The recommended size for the output buffer. + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using zstd_is_error(). */ -size_t ZSTD_CStreamOutSize(void); +size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output); +/* ====== Streaming Decompression ====== */ - -/*-***************************************************************************** - * Streaming decompression - HowTo - * - * A ZSTD_DStream object is required to track streaming operations. - * Use ZSTD_initDStream() to initialize a ZSTD_DStream object. - * ZSTD_DStream objects can be re-used multiple times. - * - * Use ZSTD_decompressStream() repetitively to consume your input. - * The function will update both `pos` fields. - * If `input->pos < input->size`, some input has not been consumed. - * It's up to the caller to present again remaining data. - * If `output->pos < output->size`, decoder has flushed everything it could. - * Returns 0 iff a frame is completely decoded and fully flushed. - * Otherwise it returns a suggested next input size that will never load more - * than the current frame. - ******************************************************************************/ +typedef ZSTD_DStream zstd_dstream; /** - * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream - * @maxWindowSize: The maximum window size allowed for compressed frames. + * zstd_dstream_workspace_bound() - memory needed to initialize a zstd_dstream + * @max_window_size: The maximum window size allowed for compressed frames. * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initDStream() and ZSTD_initDStream_usingDDict(). + * Return: A lower bound on the size of the workspace that is passed + * to zstd_init_dstream(). */ -size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize); +size_t zstd_dstream_workspace_bound(size_t max_window_size); /** - * struct ZSTD_DStream - the zstd streaming decompression context - */ -typedef struct ZSTD_DStream_s ZSTD_DStream; -/*===== ZSTD_DStream management functions =====*/ -/** - * ZSTD_initDStream() - initialize a zstd streaming decompression context - * @maxWindowSize: The maximum window size allowed for compressed frames. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine - * how large the workspace must be. - * - * Return: The zstd streaming decompression context. - */ -ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, - size_t workspaceSize); -/** - * ZSTD_initDStream_usingDDict() - initialize streaming decompression context - * @maxWindowSize: The maximum window size allowed for compressed frames. - * @ddict: The digested dictionary to use for decompression. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine - * how large the workspace must be. 
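/*
 * [Editorial sketch, not part of the patch: a minimal caller of the renamed
 * streaming-compression half of the API shown above. It assumes `cstream`
 * was already initialized (initialization is outside this hunk) and that
 * `dst` can hold the whole frame; the kernel error codes are illustrative.]
 */
static int example_stream_compress(zstd_cstream *cstream,
				   const void *src, size_t src_len,
				   void *dst, size_t dst_cap, size_t *out_len)
{
	zstd_in_buffer in = { .src = src, .size = src_len, .pos = 0 };
	zstd_out_buffer out = { .dst = dst, .size = dst_cap, .pos = 0 };
	size_t ret;

	while (in.pos < in.size) {
		ret = zstd_compress_stream(cstream, &out, &in);
		if (zstd_is_error(ret))
			return -EIO;
	}
	/* End the frame; keep calling until 0 bytes remain buffered. */
	do {
		ret = zstd_end_stream(cstream, &out);
		if (zstd_is_error(ret))
			return -EIO;
	} while (ret != 0);
	*out_len = out.pos;
	return 0;
}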
+ * zstd_init_dstream() - initialize a zstd streaming decompression context
+ * @max_window_size: The maximum window size allowed for compressed frames.
+ * @workspace: The workspace to emplace the context into. It must outlive
+ * the returned context.
+ * @workspace_size: The size of workspace.
+ * Use zstd_dstream_workspace_bound(max_window_size) to
+ * determine how large the workspace must be.
 *
- * Return: The zstd streaming decompression context.
+ * Return: The zstd streaming decompression context.
 */
-ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize,
-	const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize);
+zstd_dstream *zstd_init_dstream(size_t max_window_size, void *workspace,
+	size_t workspace_size);

-/*===== Streaming decompression functions =====*/
 /**
- * ZSTD_resetDStream() - reset the context using parameters from creation
- * @zds: The zstd streaming decompression context to reset.
+ * zstd_reset_dstream() - reset the context using parameters from creation
+ * @dstream: The zstd streaming decompression context to reset.
 *
 * Resets the context using the parameters from creation. Skips dictionary
 * loading, since it can be reused.
 *
- * Return: Zero or an error, which can be checked using ZSTD_isError().
+ * Return: Zero or an error, which can be checked using zstd_is_error().
 */
-size_t ZSTD_resetDStream(ZSTD_DStream *zds);
+size_t zstd_reset_dstream(zstd_dstream *dstream);
+
 /**
- * ZSTD_decompressStream() - streaming decompress some of input into output
- * @zds: The zstd streaming decompression context.
- * @output: Destination buffer. `output.pos` is updated to indicate how much
- * decompressed data was written.
- * @input: Source buffer. `input.pos` is updated to indicate how much data was
- * read. Note that it may not consume the entire input, in which case
- * `input.pos < input.size`, and it's up to the caller to present
- * remaining data again.
+ * zstd_decompress_stream() - streaming decompress some of input into output
+ * @dstream: The zstd streaming decompression context.
+ * @output: Destination buffer. `output.pos` is updated to indicate how much
+ * decompressed data was written.
+ * @input: Source buffer. `input.pos` is updated to indicate how much data was
+ * read. Note that it may not consume the entire input, in which case
+ * `input.pos < input.size`, and it's up to the caller to present
+ * remaining data again.
 *
 * The `input` and `output` buffers may be any size. Guaranteed to make some
 * forward progress if `input` and `output` are not empty.
- * ZSTD_decompressStream() will not consume the last byte of the frame until
+ * zstd_decompress_stream() will not consume the last byte of the frame until
 * the entire frame is flushed.
 *
- * Return: Returns 0 iff a frame is completely decoded and fully flushed.
- * Otherwise returns a hint for the number of bytes to use as the input
- * for the next function call or an error, which can be checked using
- * ZSTD_isError(). The size hint will never load more than the frame.
- */
-size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output,
-	ZSTD_inBuffer *input);
-
-/**
- * ZSTD_DStreamInSize() - recommended size for the input buffer
- *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_DStreamInSize(void);
-/**
- * ZSTD_DStreamOutSize() - recommended size for the output buffer
- *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete decompressed block.
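/*
 * [Editorial sketch, not part of the patch: carving a zstd_dstream out of a
 * workspace sized by zstd_dstream_workspace_bound(), per the documentation
 * above. kvmalloc()/kvfree() as the allocation strategy is an assumption of
 * this example.]
 */
static zstd_dstream *example_dstream_create(size_t max_window_size,
					    void **wksp_ret)
{
	size_t wksp_size = zstd_dstream_workspace_bound(max_window_size);
	void *wksp = kvmalloc(wksp_size, GFP_KERNEL);

	if (!wksp)
		return NULL;
	*wksp_ret = wksp;	/* freed by the caller after the last use */
	return zstd_init_dstream(max_window_size, wksp, wksp_size);
}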
- * - * Return: The recommended size for the output buffer. - */ -size_t ZSTD_DStreamOutSize(void); - - -/* --- Constants ---*/ -#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ -#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U - -#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) -#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) - -#define ZSTD_WINDOWLOG_MAX_32 27 -#define ZSTD_WINDOWLOG_MAX_64 27 -#define ZSTD_WINDOWLOG_MAX \ - ((unsigned int)(sizeof(size_t) == 4 \ - ? ZSTD_WINDOWLOG_MAX_32 \ - : ZSTD_WINDOWLOG_MAX_64)) -#define ZSTD_WINDOWLOG_MIN 10 -#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX -#define ZSTD_HASHLOG_MIN 6 -#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1) -#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN -#define ZSTD_HASHLOG3_MAX 17 -#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) -#define ZSTD_SEARCHLOG_MIN 1 -/* only for ZSTD_fast, other strategies are limited to 6 */ -#define ZSTD_SEARCHLENGTH_MAX 7 -/* only for ZSTD_btopt, other strategies are limited to 4 */ -#define ZSTD_SEARCHLENGTH_MIN 3 -#define ZSTD_TARGETLENGTH_MIN 4 -#define ZSTD_TARGETLENGTH_MAX 999 - -/* for static allocation */ -#define ZSTD_FRAMEHEADERSIZE_MAX 18 -#define ZSTD_FRAMEHEADERSIZE_MIN 6 -#define ZSTD_frameHeaderSize_prefix 5 -#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN -#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX -/* magic number + skippable frame length */ -#define ZSTD_skippableHeaderSize 8 - - -/*-************************************* - * Compressed size functions - **************************************/ - -/** - * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame - * @src: Source buffer. It should point to the start of a zstd encoded frame - * or a skippable frame. - * @srcSize: The size of the source buffer. It must be at least as large as the - * size of the frame. - * - * Return: The compressed size of the frame pointed to by `src` or an error, - * which can be check with ZSTD_isError(). - * Suitable to pass to ZSTD_decompress() or similar functions. - */ -size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize); - -/*-************************************* - * Decompressed size functions - **************************************/ -/** - * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header - * @src: It should point to the start of a zstd encoded frame. - * @srcSize: The size of the source buffer. It must be at least as large as the - * frame header. `ZSTD_frameHeaderSize_max` is always large enough. - * - * Return: The frame content size stored in the frame header if known. - * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the - * frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input. - */ -unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); - -/** - * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames - * @src: It should point to the start of a series of zstd encoded and/or - * skippable frames. - * @srcSize: The exact size of the series of frames. - * - * If any zstd encoded frame in the series doesn't have the frame content size - * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always - * set when using ZSTD_compress(). The decompressed size can be very large. - * If the source is untrusted, the decompressed size could be wrong or - * intentionally modified. Always ensure the result fits within the - * application's authorized limits. 
ZSTD_findDecompressedSize() handles multiple - * frames, and so it must traverse the input to read each frame header. This is - * efficient as most of the data is skipped, however it does mean that all frame - * data must be present and valid. - * - * Return: Decompressed size of all the data contained in the frames if known. - * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown. - * `ZSTD_CONTENTSIZE_ERROR` if an error occurred. - */ -unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize); - -/*-************************************* - * Advanced compression functions - **************************************/ -/** - * ZSTD_checkCParams() - ensure parameter values remain within authorized range - * @cParams: The zstd compression parameters. - * - * Return: Zero or an error, which can be checked using ZSTD_isError(). - */ -size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams); - -/** - * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize - * @srcSize: Optionally the estimated source size, or zero if unknown. - * @dictSize: Optionally the estimated dictionary size, or zero if unknown. - * - * Return: The optimized parameters. - */ -ZSTD_compressionParameters ZSTD_adjustCParams( - ZSTD_compressionParameters cParams, unsigned long long srcSize, - size_t dictSize); - -/*--- Advanced decompression functions ---*/ - -/** - * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame - * @buffer: The source buffer to check. - * @size: The size of the source buffer, must be at least 4 bytes. - * - * Return: True iff the buffer starts with a zstd or skippable frame identifier. - */ -unsigned int ZSTD_isFrame(const void *buffer, size_t size); - -/** - * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary - * @dict: The dictionary buffer. - * @dictSize: The size of the dictionary buffer. - * - * Return: The dictionary id stored within the dictionary or 0 if the - * dictionary is not a zstd dictionary. If it returns 0 the - * dictionary can still be loaded as a content-only dictionary. + * Return: Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise returns a hint for the number of bytes to use as the + * input for the next function call or an error, which can be checked + * using zstd_is_error(). The size hint will never load more than the + * frame. */ -unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize); +size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + zstd_in_buffer *input); -/** - * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict - * @ddict: The ddict to find the id of. - * - * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not - * a zstd dictionary. If it returns 0 `ddict` will be loaded as a - * content-only dictionary. - */ -unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict); +/* ====== Frame Inspection Functions ====== */ /** - * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame - * @src: Source buffer. It must be a zstd encoded frame. - * @srcSize: The size of the source buffer. It must be at least as large as the - * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * zstd_find_frame_compressed_size() - returns the size of a compressed frame + * @src: Source buffer. It should point to the start of a zstd encoded + * frame or a skippable frame. + * @src_size: The size of the source buffer. 
It must be at least as large as the
+ * size of the frame.
 *
- * Return: The compressed size of the frame pointed to by `src` or an error,
- * which can be check with ZSTD_isError().
- * Suitable to pass to ZSTD_decompress() or similar functions.
+ * Return: The compressed size of the frame pointed to by `src` or an error,
+ * which can be checked with zstd_is_error().
+ * Suitable to pass to ZSTD_decompress() or similar functions.
 */
-size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
+size_t zstd_find_frame_compressed_size(const void *src, size_t src_size);

 /**
- * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
+ * struct zstd_frame_params - zstd frame parameters stored in the frame header
 * @frameContentSize: The frame content size, or 0 if not present.
 * @windowSize: The window size, or 0 if the frame is a skippable frame.
 * @dictID: The dictionary id, or 0 if not present.
 * @checksumFlag: Whether a checksum was used.
 */
-typedef struct {
-	unsigned long long frameContentSize;
-	unsigned int windowSize;
-	unsigned int dictID;
-	unsigned int checksumFlag;
-} ZSTD_frameParams;
+typedef ZSTD_frameParams zstd_frame_header;

 /**
- * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
- * @fparamsPtr: On success the frame parameters are written here.
- * @src: The source buffer. It must point to a zstd or skippable frame.
- * @srcSize: The size of the source buffer. `ZSTD_frameHeaderSize_max` is
- * always large enough to succeed.
+ * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame
+ * @params: On success the frame parameters are written here.
+ * @src: The source buffer. It must point to a zstd or skippable frame.
+ * @src_size: The size of the source buffer.
 *
- * Return: 0 on success. If more data is required it returns how many bytes
- * must be provided to make forward progress. Otherwise it returns
- * an error, which can be checked using ZSTD_isError().
+ * Return: 0 on success. If more data is required it returns how many bytes
+ * must be provided to make forward progress. Otherwise it returns
+ * an error, which can be checked using zstd_is_error().
 */
-size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
-	size_t srcSize);
-
-/*-*****************************************************************************
- * Buffer-less and synchronous inner streaming functions
- *
- * This is an advanced API, giving full control over buffer management, for
- * users which need direct control over memory.
- * But it's also a complex one, with many restrictions (documented below).
- * Prefer using normal streaming API for an easier experience
- ******************************************************************************/
-
-/*-*****************************************************************************
- * Buffer-less streaming compression (synchronous mode)
- *
- * A ZSTD_CCtx object is required to track streaming operations.
- * Use ZSTD_initCCtx() to initialize a context.
- * ZSTD_CCtx object can be re-used multiple times within successive compression
- * operations.
- *
- * Start by initializing a context.
- * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
- * compression,
- * or ZSTD_compressBegin_advanced(), for finer parameter control.
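/*
 * [Editorial sketch, not part of the patch: zstd_get_frame_header() as the
 * gatekeeper before streaming decompression, per the documentation above.
 * Note the zstd_frame_header typedef keeps the old field names (windowSize,
 * frameContentSize, ...); `max_window` and the error codes are assumptions,
 * and `dst` is assumed large enough for the whole frame.]
 */
static int example_decompress_frame(zstd_dstream *dstream, size_t max_window,
				    const void *src, size_t src_len,
				    void *dst, size_t dst_cap, size_t *out_len)
{
	zstd_in_buffer in = { .src = src, .size = src_len, .pos = 0 };
	zstd_out_buffer out = { .dst = dst, .size = dst_cap, .pos = 0 };
	zstd_frame_header params;
	size_t ret;

	ret = zstd_get_frame_header(&params, src, src_len);
	if (ret != 0)	/* an error, or "need at least ret bytes" */
		return -EINVAL;
	if (params.windowSize > max_window)
		return -E2BIG;	/* refuse oversized windows up front */

	do {
		ret = zstd_decompress_stream(dstream, &out, &in);
		if (zstd_is_error(ret))
			return -EINVAL;
	} while (ret != 0 && in.pos < in.size);
	if (ret != 0)
		return -ENODATA;	/* input ran out mid-frame */
	*out_len = out.pos;
	return 0;
}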
- * It's also possible to duplicate a reference context which has already been - * initialized, using ZSTD_copyCCtx() - * - * Then, consume your input using ZSTD_compressContinue(). - * There are some important considerations to keep in mind when using this - * advanced function : - * - ZSTD_compressContinue() has no internal buffer. It uses externally provided - * buffer only. - * - Interface is synchronous : input is consumed entirely and produce 1+ - * (or more) compressed blocks. - * - Caller must ensure there is enough space in `dst` to store compressed data - * under worst case scenario. Worst case evaluation is provided by - * ZSTD_compressBound(). - * ZSTD_compressContinue() doesn't guarantee recover after a failed - * compression. - * - ZSTD_compressContinue() presumes prior input ***is still accessible and - * unmodified*** (up to maximum distance size, see WindowLog). - * It remembers all previous contiguous blocks, plus one separated memory - * segment (which can itself consists of multiple contiguous blocks) - * - ZSTD_compressContinue() detects that prior input has been overwritten when - * `src` buffer overlaps. In which case, it will "discard" the relevant memory - * section from its history. - * - * Finish a frame with ZSTD_compressEnd(), which will write the last block(s) - * and optional checksum. It's possible to use srcSize==0, in which case, it - * will write a final empty block to end the frame. Without last block mark, - * frames will be considered unfinished (corrupted) by decoders. - * - * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new - * frame. - ******************************************************************************/ - -/*===== Buffer-less streaming compression functions =====*/ -size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel); -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, - size_t dictSize, int compressionLevel); -size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, - size_t dictSize, ZSTD_parameters params, - unsigned long long pledgedSrcSize); -size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx, - unsigned long long pledgedSrcSize); -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, - unsigned long long pledgedSrcSize); -size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); - - - -/*-***************************************************************************** - * Buffer-less streaming decompression (synchronous mode) - * - * A ZSTD_DCtx object is required to track streaming operations. - * Use ZSTD_initDCtx() to initialize a context. - * A ZSTD_DCtx object can be re-used multiple times. - * - * First typical operation is to retrieve frame parameters, using - * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide - * important information to correctly decode the frame, such as the minimum - * rolling buffer size to allocate to decompress data (`windowSize`), and the - * dictionary ID used. - * Note: content size is optional, it may not be present. 0 means unknown. - * Note that these values could be wrong, either because of data malformation, - * or because an attacker is spoofing deliberate false information. 
As a - * consequence, check that values remain within valid application range, - * especially `windowSize`, before allocation. Each application can set its own - * limit, depending on local restrictions. For extended interoperability, it is - * recommended to support at least 8 MB. - * Frame parameters are extracted from the beginning of the compressed frame. - * Data fragment must be large enough to ensure successful decoding, typically - * `ZSTD_frameHeaderSize_max` bytes. - * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled. - * >0: `srcSize` is too small, provide at least this many bytes. - * errorCode, which can be tested using ZSTD_isError(). - * - * Start decompression, with ZSTD_decompressBegin() or - * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared - * context, using ZSTD_copyDCtx(). - * - * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() - * alternatively. - * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' - * to ZSTD_decompressContinue(). - * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will - * fail. - * - * The result of ZSTD_decompressContinue() is the number of bytes regenerated - * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an - * error; it just means ZSTD_decompressContinue() has decoded some metadata - * item. It can also be an error code, which can be tested with ZSTD_isError(). - * - * ZSTD_decompressContinue() needs previous data blocks during decompression, up - * to `windowSize`. They should preferably be located contiguously, prior to - * current block. Alternatively, a round buffer of sufficient size is also - * possible. Sufficient size is determined by frame parameters. - * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't - * follow each other, make sure that either the compressor breaks contiguity at - * the same place, or that previous contiguous segment is large enough to - * properly handle maximum back-reference. - * - * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. - * Context can then be reset to start a new decompression. - * - * Note: it's possible to know if next input to present is a header or a block, - * using ZSTD_nextInputType(). This information is not required to properly - * decode a frame. - * - * == Special case: skippable frames == - * - * Skippable frames allow integration of user-defined data into a flow of - * concatenated frames. Skippable frames will be ignored (skipped) by a - * decompressor. The format of skippable frames is as follows: - * a) Skippable frame ID - 4 Bytes, Little endian format, any value from - * 0x184D2A50 to 0x184D2A5F - * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits - * c) Frame Content - any content (User Data) of length equal to Frame Size - * For skippable frames ZSTD_decompressContinue() always returns 0. - * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0 - * what means that a frame is skippable. - * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might - * actually be a zstd encoded frame with no content. For purposes of - * decompression, it is valid in both cases to skip the frame using - * ZSTD_findFrameCompressedSize() to find its size in bytes. - * It also returns frame size as fparamsPtr->frameContentSize. 
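/*
 * [Editorial sketch, not part of the patch: the buffer-less loop that the
 * (removed) HowTo above describes, shown only to make the deleted text
 * concrete. Window/history management and skippable-frame handling are
 * glossed over; a zero return stands in for full error reporting.]
 */
static size_t example_bufferless_decode(ZSTD_DCtx *dctx,
					const char *src, size_t src_len,
					char *dst, size_t dst_cap)
{
	size_t produced = 0;
	size_t hint;

	if (ZSTD_isError(ZSTD_decompressBegin(dctx)))
		return 0;
	while ((hint = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
		size_t ret;

		if (hint > src_len)
			return 0;	/* truncated input */
		/* Exactly `hint` bytes must be handed over, no more, no less. */
		ret = ZSTD_decompressContinue(dctx, dst + produced,
					      dst_cap - produced, src, hint);
		if (ZSTD_isError(ret))
			return 0;
		produced += ret;	/* 0 is fine: a metadata item was decoded */
		src += hint;
		src_len -= hint;
	}
	return produced;
}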
- ******************************************************************************/ - -/*===== Buffer-less streaming decompression functions =====*/ -size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx); -size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, - size_t dictSize); -void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx); -size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx); -size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -typedef enum { - ZSTDnit_frameHeader, - ZSTDnit_blockHeader, - ZSTDnit_block, - ZSTDnit_lastBlock, - ZSTDnit_checksum, - ZSTDnit_skippableFrame -} ZSTD_nextInputType_e; -ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx); - -/*-***************************************************************************** - * Block functions - * - * Block functions produce and decode raw zstd blocks, without frame metadata. - * Frame metadata cost is typically ~18 bytes, which can be non-negligible for - * very small blocks (< 100 bytes). User will have to take in charge required - * information to regenerate data, such as compressed and content sizes. - * - * A few rules to respect: - * - Compressing and decompressing require a context structure - * + Use ZSTD_initCCtx() and ZSTD_initDCtx() - * - It is necessary to init context before starting - * + compression : ZSTD_compressBegin() - * + decompression : ZSTD_decompressBegin() - * + variants _usingDict() are also allowed - * + copyCCtx() and copyDCtx() work too - * - Block size is limited, it must be <= ZSTD_getBlockSizeMax() - * + If you need to compress more, cut data into multiple blocks - * + Consider using the regular ZSTD_compress() instead, as frame metadata - * costs become negligible when source size is large. - * - When a block is considered not compressible enough, ZSTD_compressBlock() - * result will be zero. In which case, nothing is produced into `dst`. - * + User must test for such outcome and deal directly with uncompressed data - * + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!! - * + In case of multiple successive blocks, decoder must be informed of - * uncompressed block existence to follow proper history. Use - * ZSTD_insertBlock() in such a case. - ******************************************************************************/ - -/* Define for static allocation */ -#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) -/*===== Raw zstd block functions =====*/ -size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx); -size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, - size_t blockSize); +size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); -#endif /* ZSTD_H */ +#endif /* LINUX_ZSTD_H */ diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h new file mode 100644 index 000000000000..13151c34f725 --- /dev/null +++ b/include/linux/zstd_lib.h @@ -0,0 +1,1157 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. 
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef ZSTD_H
+#define ZSTD_H
+
+/* ====== Dependency ======*/
+#include <linux/types.h>	/* size_t */
+
+
+/*-*****************************************************************************
+ * Introduction
+ *
+ * zstd, short for Zstandard, is a fast lossless compression algorithm,
+ * targeting real-time compression scenarios at zlib-level and better
+ * compression ratios. The zstd compression library provides in-memory
+ * compression and decompression functions. The library supports compression
+ * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled
+ * ultra, should be used with caution, as they require more memory.
+ * Compression can be done in:
+ *  - a single step, reusing a context (described as Explicit memory management)
+ *  - unbounded multiple steps (described as Streaming compression)
+ * The compression ratio achievable on small data can be highly improved using
+ * compression with a dictionary in:
+ *  - a single step (described as Simple dictionary API)
+ *  - a single step, reusing a dictionary (described as Fast dictionary API)
+ ******************************************************************************/
+
+/*====== Helper functions ======*/
+
+/**
+ * enum ZSTD_ErrorCode - zstd error codes
+ *
+ * Functions that return size_t can be checked for errors using ZSTD_isError()
+ * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode().
+ */
+typedef enum {
+	ZSTD_error_no_error,
+	ZSTD_error_GENERIC,
+	ZSTD_error_prefix_unknown,
+	ZSTD_error_version_unsupported,
+	ZSTD_error_parameter_unknown,
+	ZSTD_error_frameParameter_unsupported,
+	ZSTD_error_frameParameter_unsupportedBy32bits,
+	ZSTD_error_frameParameter_windowTooLarge,
+	ZSTD_error_compressionParameter_unsupported,
+	ZSTD_error_init_missing,
+	ZSTD_error_memory_allocation,
+	ZSTD_error_stage_wrong,
+	ZSTD_error_dstSize_tooSmall,
+	ZSTD_error_srcSize_wrong,
+	ZSTD_error_corruption_detected,
+	ZSTD_error_checksum_wrong,
+	ZSTD_error_tableLog_tooLarge,
+	ZSTD_error_maxSymbolValue_tooLarge,
+	ZSTD_error_maxSymbolValue_tooSmall,
+	ZSTD_error_dictionary_corrupted,
+	ZSTD_error_dictionary_wrong,
+	ZSTD_error_dictionaryCreation_failed,
+	ZSTD_error_maxCode
+} ZSTD_ErrorCode;
+
+/**
+ * ZSTD_maxCLevel() - maximum compression level available
+ *
+ * Return: Maximum compression level available.
+ */
+int ZSTD_maxCLevel(void);
+/**
+ * ZSTD_compressBound() - maximum compressed size in worst case scenario
+ * @srcSize: The size of the data to compress.
+ *
+ * Return: The maximum compressed size in the worst case scenario.
+ */
+size_t ZSTD_compressBound(size_t srcSize);
+/**
+ * ZSTD_isError() - tells if a size_t function result is an error code
+ * @code: The function result to check for error.
+ *
+ * Return: Non-zero iff the code is an error.
+ */
+static __attribute__((unused)) unsigned int ZSTD_isError(size_t code)
+{
+	return code > (size_t)-ZSTD_error_maxCode;
+}
+/**
+ * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode
+ * @functionResult: The result of a function for which ZSTD_isError() is true.
+ *
+ * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0
+ * if the functionResult isn't an error.
+ */ +static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode( + size_t functionResult) +{ + if (!ZSTD_isError(functionResult)) + return (ZSTD_ErrorCode)0; + return (ZSTD_ErrorCode)(0 - functionResult); +} + +/** + * enum ZSTD_strategy - zstd compression search strategy + * + * From faster to stronger. + */ +typedef enum { + ZSTD_fast, + ZSTD_dfast, + ZSTD_greedy, + ZSTD_lazy, + ZSTD_lazy2, + ZSTD_btlazy2, + ZSTD_btopt, + ZSTD_btopt2 +} ZSTD_strategy; + +/** + * struct ZSTD_compressionParameters - zstd compression parameters + * @windowLog: Log of the largest match distance. Larger means more + * compression, and more memory needed during decompression. + * @chainLog: Fully searched segment. Larger means more compression, slower, + * and more memory (useless for fast). + * @hashLog: Dispatch table. Larger means more compression, + * slower, and more memory. + * @searchLog: Number of searches. Larger means more compression and slower. + * @searchLength: Match length searched. Larger means faster decompression, + * sometimes less compression. + * @targetLength: Acceptable match size for optimal parser (only). Larger means + * more compression, and slower. + * @strategy: The zstd compression strategy. + */ +typedef struct { + unsigned int windowLog; + unsigned int chainLog; + unsigned int hashLog; + unsigned int searchLog; + unsigned int searchLength; + unsigned int targetLength; + ZSTD_strategy strategy; +} ZSTD_compressionParameters; + +/** + * struct ZSTD_frameParameters - zstd frame parameters + * @contentSizeFlag: Controls whether content size will be present in the frame + * header (when known). + * @checksumFlag: Controls whether a 32-bit checksum is generated at the end + * of the frame for error detection. + * @noDictIDFlag: Controls whether dictID will be saved into the frame header + * when using dictionary compression. + * + * The default value is all fields set to 0. + */ +typedef struct { + unsigned int contentSizeFlag; + unsigned int checksumFlag; + unsigned int noDictIDFlag; +} ZSTD_frameParameters; + +/** + * struct ZSTD_parameters - zstd parameters + * @cParams: The compression parameters. + * @fParams: The frame parameters. + */ +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +/** + * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * Return: The selected ZSTD_compressionParameters. + */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/** + * ZSTD_getParams() - returns ZSTD_parameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * The same as ZSTD_getCParams() except also selects the default frame + * parameters (all zero). + * + * Return: The selected ZSTD_parameters. 
+ */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/*-************************************* + * Explicit memory management + **************************************/ + +/** + * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx + * @cParams: The compression parameters to be used for compression. + * + * If multiple compression parameters might be used, the caller must call + * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum + * size. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCCtx(). + */ +size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CCtx - the zstd compression context + * + * When compressing many times it is recommended to allocate a context just once + * and reuse it for each successive compression operation. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +/** + * ZSTD_initCCtx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Use ZSTD_CCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A compression context emplaced into workspace. + */ +ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_compressCCtx() - compress src into dst + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, ZSTD_parameters params); + +/** + * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDCtx(). + */ +size_t ZSTD_DCtxWorkspaceBound(void); + +/** + * struct ZSTD_DCtx - the zstd decompression context + * + * When decompressing many times it is recommended to allocate a context just + * once and reuse it for each successive decompression operation. + */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +/** + * ZSTD_initDCtx() - initialize a zstd decompression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A decompression context emplaced into workspace. + */ +ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompressDCtx() - decompress zstd compressed src into dst + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. 
Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); + +/*-************************ + * Simple dictionary API + **************************/ + +/** + * ZSTD_compress_usingDict() - compress src into dst using a dictionary + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @dict: The dictionary to use for compression. + * @dictSize: The size of the dictionary. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Compression using a predefined dictionary. The same dictionary must be used + * during decompression. + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const void *dict, size_t dictSize, + ZSTD_parameters params); + +/** + * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * @dict: The dictionary to use for decompression. The same dictionary + * must've been used to compress the data. + * @dictSize: The size of the dictionary. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const void *dict, size_t dictSize); + +/*-************************** + * Fast dictionary API + ***************************/ + +/** + * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict + * @cParams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCDict(). + */ +size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CDict - a digested dictionary to be used for compression + */ +typedef struct ZSTD_CDict_s ZSTD_CDict; + +/** + * ZSTD_initCDict() - initialize a digested dictionary for compression + * @dictBuffer: The dictionary to digest. The buffer is referenced by the + * ZSTD_CDict so it must outlive the returned ZSTD_CDict. + * @dictSize: The size of the dictionary. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * @workspace: The workspace. It must outlive the returned ZSTD_CDict. + * @workspaceSize: The workspace size. Must be at least + * ZSTD_CDictWorkspaceBound(params.cParams). 
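/*
 * [Editorial sketch, not part of the patch: a round trip through the simple
 * dictionary API above. Both contexts are assumed to be initialized with
 * sufficiently large workspaces; level 3 is an arbitrary choice. The key
 * contract is that the exact same dict buffer appears on both sides.]
 */
static bool example_dict_roundtrip(ZSTD_CCtx *cctx, ZSTD_DCtx *dctx,
				   const void *dict, size_t dict_len,
				   const void *src, size_t src_len,
				   void *comp, size_t comp_cap,
				   void *out, size_t out_cap)
{
	ZSTD_parameters params = ZSTD_getParams(3, src_len, dict_len);
	size_t c, d;

	c = ZSTD_compress_usingDict(cctx, comp, comp_cap, src, src_len,
				    dict, dict_len, params);
	if (ZSTD_isError(c))
		return false;
	d = ZSTD_decompress_usingDict(dctx, out, out_cap, comp, c,
				      dict, dict_len);
	return !ZSTD_isError(d) && d == src_len;
}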
+ * + * When compressing multiple messages / blocks with the same dictionary it is + * recommended to load it just once. The ZSTD_CDict merely references the + * dictBuffer, so it must outlive the returned ZSTD_CDict. + * + * Return: The digested dictionary emplaced into workspace. + */ +ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize, + ZSTD_parameters params, void *workspace, size_t workspaceSize); + +/** + * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(cParams) where + * cParams are the compression parameters used to initialize the + * cdict. + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @cdict: The digested dictionary to use for compression. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Compression using a digested dictionary. The same dictionary must be used + * during decompression. + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const ZSTD_CDict *cdict); + + +/** + * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDDict(). + */ +size_t ZSTD_DDictWorkspaceBound(void); + +/** + * struct ZSTD_DDict - a digested dictionary to be used for decompression + */ +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/** + * ZSTD_initDDict() - initialize a digested dictionary for decompression + * @dictBuffer: The dictionary to digest. The buffer is referenced by the + * ZSTD_DDict so it must outlive the returned ZSTD_DDict. + * @dictSize: The size of the dictionary. + * @workspace: The workspace. It must outlive the returned ZSTD_DDict. + * @workspaceSize: The workspace size. Must be at least + * ZSTD_DDictWorkspaceBound(). + * + * When decompressing multiple messages / blocks with the same dictionary it is + * recommended to load it just once. The ZSTD_DDict merely references the + * dictBuffer, so it must outlive the returned ZSTD_DDict. + * + * Return: The digested dictionary emplaced into workspace. + */ +ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize, + void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * @ddict: The digested dictionary to use for decompression. The same + * dictionary must've been used to compress the data. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). 
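/*
 * [Editorial sketch, not part of the patch: building a digested dictionary
 * once and reusing it, per the fast dictionary API above. kvmalloc() is an
 * assumption; as documented, both the dict buffer and the workspace must
 * outlive the returned ZSTD_DDict.]
 */
static ZSTD_DDict *example_ddict_create(const void *dict, size_t dict_len,
					void **wksp_ret)
{
	size_t wksp_size = ZSTD_DDictWorkspaceBound();
	void *wksp = kvmalloc(wksp_size, GFP_KERNEL);

	if (!wksp)
		return NULL;
	*wksp_ret = wksp;	/* freed by the caller after the last use */
	return ZSTD_initDDict(dict, dict_len, wksp, wksp_size);
}
/* Each buffer is then decoded with ZSTD_decompress_usingDDict(dctx, ...). */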
+ */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, + size_t dstCapacity, const void *src, size_t srcSize, + const ZSTD_DDict *ddict); + + +/*-************************** + * Streaming + ***************************/ + +/** + * struct ZSTD_inBuffer - input buffer for streaming + * @src: Start of the input buffer. + * @size: Size of the input buffer. + * @pos: Position where reading stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_inBuffer_s { + const void *src; + size_t size; + size_t pos; +} ZSTD_inBuffer; + +/** + * struct ZSTD_outBuffer - output buffer for streaming + * @dst: Start of the output buffer. + * @size: Size of the output buffer. + * @pos: Position where writing stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_outBuffer_s { + void *dst; + size_t size; + size_t pos; +} ZSTD_outBuffer; + + + +/*-***************************************************************************** + * Streaming compression - HowTo + * + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_initCStream() to initialize a ZSTD_CStream object. + * ZSTD_CStream objects can be reused multiple times on consecutive compression + * operations. It is recommended to re-use ZSTD_CStream in situations where many + * streaming operations will be achieved consecutively. Use one separate + * ZSTD_CStream per thread for parallel execution. + * + * Use ZSTD_compressStream() repetitively to consume input stream. + * The function will automatically update both `pos` fields. + * Note that it may not consume the entire input, in which case `pos < size`, + * and it's up to the caller to present again remaining data. + * It returns a hint for the preferred number of bytes to use as an input for + * the next function call. + * + * At any moment, it's possible to flush whatever data remains within internal + * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might + * still be some content left within the internal buffer if `output->size` is + * too small. It returns the number of bytes left in the internal buffer and + * must be called until it returns 0. + * + * ZSTD_endStream() instructs to finish a frame. It will perform a flush and + * write frame epilogue. The epilogue is required for decoders to consider a + * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush + * the full content if `output->size` is too small. In which case, call again + * ZSTD_endStream() to complete the flush. It returns the number of bytes left + * in the internal buffer and must be called until it returns 0. + ******************************************************************************/ + +/** + * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream + * @cParams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCStream() and ZSTD_initCStream_usingCDict(). + */ +size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CStream - the zstd streaming compression context + */ +typedef struct ZSTD_CStream_s ZSTD_CStream; + +/*===== ZSTD_CStream management functions =====*/ +/** + * ZSTD_initCStream() - initialize a zstd streaming compression context + * @params: The zstd compression parameters. + * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must + * pass the source size (zero means empty source). 
Otherwise, + * the caller may optionally pass the source size, or zero if + * unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. + */ +ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/** + * ZSTD_initCStream_usingCDict() - initialize a streaming compression context + * @cdict: The digested dictionary to use for compression. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound() + * with the cParams used to initialize the cdict to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. + */ +ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/*===== Streaming compression functions =====*/ +/** + * ZSTD_resetCStream() - reset the context using parameters from creation + * @zcs: The zstd streaming compression context to reset. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame + * content size is always written into the frame header. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize); +/** + * ZSTD_compressStream() - streaming compress some of input into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input->pos < input->size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * + * Return: A hint for the number of bytes to use as the input for the next + * function call or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); +/** + * ZSTD_flushStream() - flush internal buffers into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_flushStream() must be called until it returns 0, meaning all the data + * has been flushed. Since ZSTD_flushStream() causes a block to be ended, + * calling it too often will degrade the compression ratio. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); +/** + * ZSTD_endStream() - flush internal buffers into output and end the frame + * @zcs: The zstd streaming compression context. 
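/*
 * [Editorial sketch, not part of the patch: feeding ZSTD_compressStream()
 * and flushing as the HowTo above prescribes; it mirrors the renamed-API
 * example earlier in this patch. Assumes `zcs` is initialized and that the
 * caller drains `out` whenever it fills; -EIO is an illustrative error code.]
 */
static int example_compress_chunk(ZSTD_CStream *zcs,
				  ZSTD_inBuffer *in, ZSTD_outBuffer *out)
{
	size_t hint;

	while (in->pos < in->size) {
		hint = ZSTD_compressStream(zcs, out, in);
		if (ZSTD_isError(hint))
			return -EIO;
	}
	/* Optional: force out buffered data at a record boundary. */
	do {
		hint = ZSTD_flushStream(zcs, out);
		if (ZSTD_isError(hint))
			return -EIO;
	} while (hint != 0);	/* bytes still buffered internally */
	return 0;
}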
+ * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_endStream() must be called until it returns 0, meaning all the data has + * been flushed and the frame epilogue has been written. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); + +/** + * ZSTD_CStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. + */ +size_t ZSTD_CStreamInSize(void); +/** + * ZSTD_CStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete compressed block. + * + * Return: The recommended size for the output buffer. + */ +size_t ZSTD_CStreamOutSize(void); + + + +/*-***************************************************************************** + * Streaming decompression - HowTo + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_initDStream() to initialize a ZSTD_DStream object. + * ZSTD_DStream objects can be re-used multiple times. + * + * Use ZSTD_decompressStream() repetitively to consume your input. + * The function will update both `pos` fields. + * If `input->pos < input->size`, some input has not been consumed. + * It's up to the caller to present again remaining data. + * If `output->pos < output->size`, decoder has flushed everything it could. + * Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise it returns a suggested next input size that will never load more + * than the current frame. + ******************************************************************************/ + +/** + * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream + * @maxWindowSize: The maximum window size allowed for compressed frames. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDStream() and ZSTD_initDStream_usingDDict(). + */ +size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize); + +/** + * struct ZSTD_DStream - the zstd streaming decompression context + */ +typedef struct ZSTD_DStream_s ZSTD_DStream; +/*===== ZSTD_DStream management functions =====*/ +/** + * ZSTD_initDStream() - initialize a zstd streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. + */ +ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, + size_t workspaceSize); +/** + * ZSTD_initDStream_usingDDict() - initialize streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @ddict: The digested dictionary to use for decompression. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. 
+ */ +ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, + const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize); + +/*===== Streaming decompression functions =====*/ +/** + * ZSTD_resetDStream() - reset the context using parameters from creation + * @zds: The zstd streaming decompression context to reset. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetDStream(ZSTD_DStream *zds); +/** + * ZSTD_decompressStream() - streaming decompress some of input into output + * @zds: The zstd streaming decompression context. + * @output: Destination buffer. `output.pos` is updated to indicate how much + * decompressed data was written. + * @input: Source buffer. `input.pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input.pos < input.size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * ZSTD_decompressStream() will not consume the last byte of the frame until + * the entire frame is flushed. + * + * Return: Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise returns a hint for the number of bytes to use as the input + * for the next function call or an error, which can be checked using + * ZSTD_isError(). The size hint will never load more than the frame. + */ +size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); + +/** + * ZSTD_DStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. + */ +size_t ZSTD_DStreamInSize(void); +/** + * ZSTD_DStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete decompressed block. + * + * Return: The recommended size for the output buffer. + */ +size_t ZSTD_DStreamOutSize(void); + + +/* --- Constants ---*/ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U + +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + +#define ZSTD_WINDOWLOG_MAX_32 27 +#define ZSTD_WINDOWLOG_MAX_64 27 +#define ZSTD_WINDOWLOG_MAX \ + ((unsigned int)(sizeof(size_t) == 4 \ + ? 
ZSTD_WINDOWLOG_MAX_32 \ + : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_HASHLOG3_MAX 17 +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +/* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_SEARCHLENGTH_MAX 7 +/* only for ZSTD_btopt, other strategies are limited to 4 */ +#define ZSTD_SEARCHLENGTH_MIN 3 +#define ZSTD_TARGETLENGTH_MIN 4 +#define ZSTD_TARGETLENGTH_MAX 999 + +/* for static allocation */ +#define ZSTD_FRAMEHEADERSIZE_MAX 18 +#define ZSTD_FRAMEHEADERSIZE_MIN 6 +#define ZSTD_frameHeaderSize_prefix 5 +#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN +#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX +/* magic number + skippable frame length */ +#define ZSTD_skippableHeaderSize 8 + + +/*-************************************* + * Compressed size functions + **************************************/ + +/** + * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame + * @src: Source buffer. It should point to the start of a zstd encoded frame + * or a skippable frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * size of the frame. + * + * Return: The compressed size of the frame pointed to by `src` or an error, + * which can be checked with ZSTD_isError(). + * Suitable to pass to ZSTD_decompress() or similar functions. + */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Decompressed size functions + **************************************/ +/** + * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header + * @src: It should point to the start of a zstd encoded frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * + * Return: The frame content size stored in the frame header if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the + * frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input. + */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/** + * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames + * @src: It should point to the start of a series of zstd encoded and/or + * skippable frames. + * @srcSize: The exact size of the series of frames. + * + * If any zstd encoded frame in the series doesn't have the frame content size + * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always + * set when using ZSTD_compress(). The decompressed size can be very large. + * If the source is untrusted, the decompressed size could be wrong or + * intentionally modified. Always ensure the result fits within the + * application's authorized limits. ZSTD_findDecompressedSize() handles multiple + * frames, and so it must traverse the input to read each frame header. This is + * efficient as most of the data is skipped; however, it does mean that all frame + * data must be present and valid. + * + * Return: Decompressed size of all the data contained in the frames if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown. + * `ZSTD_CONTENTSIZE_ERROR` if an error occurred.
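+ *
+ * A typical validation sketch, assuming a hypothetical application-defined
+ * limit `MAX_ALLOWED_OUTPUT`:
+ *
+ *	unsigned long long dstSize = ZSTD_findDecompressedSize(src, srcSize);
+ *	if (dstSize == ZSTD_CONTENTSIZE_UNKNOWN)
+ *		return -1; /* fall back to streaming decompression */
+ *	if (dstSize == ZSTD_CONTENTSIZE_ERROR || dstSize > MAX_ALLOWED_OUTPUT)
+ *		return -1; /* reject invalid or untrusted oversized input */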
+ */ +unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Advanced compression functions + **************************************/ +/** + * ZSTD_checkCParams() - ensure parameter values remain within authorized range + * @cParams: The zstd compression parameters. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams); + +/** + * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize + * @srcSize: Optionally the estimated source size, or zero if unknown. + * @dictSize: Optionally the estimated dictionary size, or zero if unknown. + * + * Return: The optimized parameters. + */ +ZSTD_compressionParameters ZSTD_adjustCParams( + ZSTD_compressionParameters cParams, unsigned long long srcSize, + size_t dictSize); + +/*--- Advanced decompression functions ---*/ + +/** + * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame + * @buffer: The source buffer to check. + * @size: The size of the source buffer, must be at least 4 bytes. + * + * Return: True iff the buffer starts with a zstd or skippable frame identifier. + */ +unsigned int ZSTD_isFrame(const void *buffer, size_t size); + +/** + * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary + * @dict: The dictionary buffer. + * @dictSize: The size of the dictionary buffer. + * + * Return: The dictionary id stored within the dictionary or 0 if the + * dictionary is not a zstd dictionary. If it returns 0 the + * dictionary can still be loaded as a content-only dictionary. + */ +unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize); + +/** + * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict + * @ddict: The ddict to find the id of. + * + * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not + * a zstd dictionary. If it returns 0 `ddict` will be loaded as a + * content-only dictionary. + */ +unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict); + +/** + * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame + * @src: Source buffer. It must be a zstd encoded frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * + * Return: The dictionary id required to decompress the frame stored within + * `src` or 0 if the dictionary id could not be decoded. It can return + * 0 if the frame does not require a dictionary, the dictionary id + * wasn't stored in the frame, `src` is not a zstd frame, or `srcSize` + * is too small. + */ +unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize); + +/** + * struct ZSTD_frameParams - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or 0 if not present. + * @windowSize: The window size, or 0 if the frame is a skippable frame. + * @dictID: The dictionary id, or 0 if not present. + * @checksumFlag: Whether a checksum was used. + */ +typedef struct { + unsigned long long frameContentSize; + unsigned int windowSize; + unsigned int dictID; + unsigned int checksumFlag; +} ZSTD_frameParams; + +/** + * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame + * @fparamsPtr: On success the frame parameters are written here. + * @src: The source buffer. It must point to a zstd or skippable frame. 
+ * @srcSize: The size of the source buffer. `ZSTD_frameHeaderSize_max` is + * always large enough to succeed. + * + * Return: 0 on success. If more data is required it returns how many bytes + * must be provided to make forward progress. Otherwise it returns + * an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, + size_t srcSize); + +/*-***************************************************************************** + * Buffer-less and synchronous inner streaming functions + * + * This is an advanced API, giving full control over buffer management, for + * users who need direct control over memory. + * But it's also a complex one, with many restrictions (documented below). + * Prefer using the normal streaming API for an easier experience. + ******************************************************************************/ + +/*-***************************************************************************** + * Buffer-less streaming compression (synchronous mode) + * + * A ZSTD_CCtx object is required to track streaming operations. + * Use ZSTD_initCCtx() to initialize a context. + * ZSTD_CCtx object can be re-used multiple times within successive compression + * operations. + * + * Start by initializing a context. + * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary + * compression, + * or ZSTD_compressBegin_advanced(), for finer parameter control. + * It's also possible to duplicate a reference context which has already been + * initialized, using ZSTD_copyCCtx(). + * + * Then, consume your input using ZSTD_compressContinue(). + * There are some important considerations to keep in mind when using this + * advanced function : + * - ZSTD_compressContinue() has no internal buffer. It uses externally provided + * buffers only. + * - Interface is synchronous : input is consumed entirely and produces one or + * more compressed blocks. + * - Caller must ensure there is enough space in `dst` to store compressed data + * under worst case scenario. Worst case evaluation is provided by + * ZSTD_compressBound(). + * ZSTD_compressContinue() doesn't guarantee recovery after a failed + * compression. + * - ZSTD_compressContinue() presumes prior input ***is still accessible and + * unmodified*** (up to maximum distance size, see WindowLog). + * It remembers all previous contiguous blocks, plus one separated memory + * segment (which can itself consist of multiple contiguous blocks). + * - ZSTD_compressContinue() detects that prior input has been overwritten when + * the `src` buffer overlaps. In which case, it will "discard" the relevant + * memory section from its history. + * + * Finish a frame with ZSTD_compressEnd(), which will write the last block(s) + * and optional checksum. It's possible to use srcSize==0, in which case, it + * will write a final empty block to end the frame. Without the last block mark, + * frames will be considered unfinished (corrupted) by decoders. + * + * The `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress a + * new frame.
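+ *
+ * A minimal sketch of the sequence above for a single contiguous input,
+ * assuming `dst` is a char buffer sized per ZSTD_compressBound(srcSize)
+ * (names illustrative, error handling reduced to ZSTD_isError() checks):
+ *
+ *	size_t pos = 0, ret;
+ *	ret = ZSTD_compressBegin(cctx, 3); /* compression level 3 */
+ *	if (ZSTD_isError(ret))
+ *		return ret;
+ *	ret = ZSTD_compressContinue(cctx, dst, dstCapacity, src, srcSize);
+ *	if (ZSTD_isError(ret))
+ *		return ret;
+ *	pos += ret;
+ *	ret = ZSTD_compressEnd(cctx, dst + pos, dstCapacity - pos, NULL, 0);
+ *	if (ZSTD_isError(ret))
+ *		return ret;
+ *	pos += ret; /* pos now holds the size of the finished frame */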
+ ******************************************************************************/ + +/*===== Buffer-less streaming compression functions =====*/ +size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel); +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, + size_t dictSize, int compressionLevel); +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, + size_t dictSize, ZSTD_parameters params, + unsigned long long pledgedSrcSize); +size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx, + unsigned long long pledgedSrcSize); +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, + unsigned long long pledgedSrcSize); +size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); + + + +/*-***************************************************************************** + * Buffer-less streaming decompression (synchronous mode) + * + * A ZSTD_DCtx object is required to track streaming operations. + * Use ZSTD_initDCtx() to initialize a context. + * A ZSTD_DCtx object can be re-used multiple times. + * + * The first typical operation is to retrieve frame parameters, using + * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provides + * important information to correctly decode the frame, such as the minimum + * rolling buffer size to allocate to decompress data (`windowSize`), and the + * dictionary ID used. + * Note: content size is optional, it may not be present. 0 means unknown. + * Note that these values could be wrong, either because of data malformation, + * or because an attacker is spoofing deliberate false information. As a + * consequence, check that values remain within valid application range, + * especially `windowSize`, before allocation. Each application can set its own + * limit, depending on local restrictions. For extended interoperability, it is + * recommended to support at least 8 MB. + * Frame parameters are extracted from the beginning of the compressed frame. + * Data fragment must be large enough to ensure successful decoding, typically + * `ZSTD_frameHeaderSize_max` bytes. + * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled. + * >0: `srcSize` is too small, provide at least this many bytes. + * or an errorCode, which can be tested using ZSTD_isError(). + * + * Start decompression, with ZSTD_decompressBegin() or + * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared + * context, using ZSTD_copyDCtx(). + * + * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() + * alternately. + * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' + * to ZSTD_decompressContinue(). + * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will + * fail. + * + * The result of ZSTD_decompressContinue() is the number of bytes regenerated + * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an + * error; it just means ZSTD_decompressContinue() has decoded some metadata + * item. It can also be an error code, which can be tested with ZSTD_isError(). + * + * ZSTD_decompressContinue() needs previous data blocks during decompression, up + * to `windowSize`. They should preferably be located contiguously, prior to + * current block. Alternatively, a round buffer of sufficient size is also + * possible.
Sufficient size is determined by frame parameters. + * ZSTD_decompressContinue() is very sensitive to contiguity; if 2 blocks don't + * follow each other, make sure that either the compressor breaks contiguity at + * the same place, or that the previous contiguous segment is large enough to + * properly handle the maximum back-reference. + * + * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + * Context can then be reset to start a new decompression. + * + * Note: it's possible to know if next input to present is a header or a block, + * using ZSTD_nextInputType(). This information is not required to properly + * decode a frame. + * + * == Special case: skippable frames == + * + * Skippable frames allow integration of user-defined data into a flow of + * concatenated frames. Skippable frames will be ignored (skipped) by a + * decompressor. The format of skippable frames is as follows: + * a) Skippable frame ID - 4 Bytes, Little endian format, any value from + * 0x184D2A50 to 0x184D2A5F + * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + * c) Frame Content - any content (User Data) of length equal to Frame Size + * For skippable frames ZSTD_decompressContinue() always returns 0. + * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowSize==0, + * which means that the frame is skippable. + * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might + * actually be a zstd encoded frame with no content. For purposes of + * decompression, it is valid in both cases to skip the frame using + * ZSTD_findFrameCompressedSize() to find its size in bytes. + * It also returns the frame size as fparamsPtr->frameContentSize. + ******************************************************************************/ + +/*===== Buffer-less streaming decompression functions =====*/ +size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx); +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, + size_t dictSize); +void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx); +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx); +size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +typedef enum { + ZSTDnit_frameHeader, + ZSTDnit_blockHeader, + ZSTDnit_block, + ZSTDnit_lastBlock, + ZSTDnit_checksum, + ZSTDnit_skippableFrame +} ZSTD_nextInputType_e; +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx); + +/*-***************************************************************************** + * Block functions + * + * Block functions produce and decode raw zstd blocks, without frame metadata. + * Frame metadata cost is typically ~18 bytes, which can be non-negligible for + * very small blocks (< 100 bytes). The user must keep track of the information + * required to regenerate the data, such as compressed and content sizes. + * + * A few rules to respect: + * - Compressing and decompressing require a context structure + * + Use ZSTD_initCCtx() and ZSTD_initDCtx() + * - It is necessary to init context before starting + * + compression : ZSTD_compressBegin() + * + decompression : ZSTD_decompressBegin() + * + variants _usingDict() are also allowed + * + copyCCtx() and copyDCtx() work too + * - Block size is limited, it must be <= ZSTD_getBlockSizeMax() + * + If you need to compress more, cut data into multiple blocks + * + Consider using the regular ZSTD_compress() instead, as frame metadata + * costs become negligible when source size is large.
+ * - When a block is considered not compressible enough, ZSTD_compressBlock() + * result will be zero. In which case, nothing is produced into `dst`. + * + User must test for such outcome and deal directly with uncompressed data + * + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!! + * + In case of multiple successive blocks, decoder must be informed of + * uncompressed block existence to follow proper history. Use + * ZSTD_insertBlock() in such a case. + ******************************************************************************/ + +/* Define for static allocation */ +#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) +/*===== Raw zstd block functions =====*/ +size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx); +size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, + size_t blockSize); + +#endif /* ZSTD_H */ -- cgit v1.2.3 From e0c1b49f5b674cca7b10549c53b3791d0bbc90a8 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 11 Sep 2020 16:37:08 -0700 Subject: lib: zstd: Upgrade to latest upstream zstd version 1.4.10 Upgrade to the latest upstream zstd version 1.4.10. This patch is 100% generated from upstream zstd commit 20821a46f412 [0]. This patch is very large because it is transitioning from the custom kernel zstd to using upstream directly. The new zstd follows upstream's file structure, which is different. Future update patches will be much smaller because they will only contain the changes from one upstream zstd release. As an aid for review, I've created a commit [1] that shows the diff between upstream zstd as-is (which doesn't compile), and the zstd code imported in this patch. The version of zstd in this patch is generated from upstream with changes applied by automation to replace upstream's libc dependencies, remove unnecessary portability macros, replace `/**` comments with `/*` comments, and use the kernel's xxhash instead of bundling it. The benefits of this patch are as follows: 1. Using upstream directly with an automated script to generate kernel code. This allows us to update the kernel every upstream release, so the kernel gets the latest bug fixes and performance improvements, and doesn't get 3 years out of date again. The automation and the translated code are tested every upstream commit to ensure it continues to work. 2. Upgrades from a custom zstd based on 1.3.1 to 1.4.10, getting 3 years of performance improvements and bug fixes. On x86_64 I've measured 15% faster BtrFS and SquashFS decompression+read speeds, 35% faster kernel decompression, and 30% faster ZRAM decompression+read speeds. 3. Zstd-1.4.10 supports negative compression levels, which allow zstd to match or subsume lzo's performance. 4. Maintains the same kernel-specific wrapper API, so no callers have to be modified with zstd version updates. One concern that was brought up was stack usage. Upstream zstd had already removed most of its heavy stack usage functions, but I just removed the last functions that allocate arrays on the stack. I've measured the high water mark for both compression and decompression before and after this patch. Decompression is approximately neutral, using about 1.2KB of stack space. Compression levels up to 3 regressed from 1.4KB -> 1.6KB, and higher compression levels regressed from 1.5KB -> 2KB.
We've added unit tests upstream to prevent further regression. I believe that this is a reasonable increase, and if it does end up causing problems, this commit can be cleanly reverted, because it only touches zstd. I chose the bulk update instead of replaying upstream commits because there have been ~3500 upstream commits since the 1.3.1 release, zstd wasn't ready to be used in the kernel as-is before a month ago, and not all upstream zstd commits build. The bulk update preserves bisectability because bugs can be bisected to the zstd version update. At that point the update can be reverted, and we can work with upstream to find and fix the bug. Note that upstream zstd release 1.4.10 doesn't exist yet. I have cut a staging branch at 20821a46f412 [0] and will apply any changes requested to the staging branch. Once we're ready to merge this update, I will cut a zstd release at the commit we merge, so we have a known zstd release in the kernel. The implementation of the kernel API is contained in zstd_compress_module.c and zstd_decompress_module.c. [0] https://github.com/facebook/zstd/commit/20821a46f4122f9abd7c7b245d28162dde8129c9 [1] https://github.com/terrelln/linux/commit/e0fa481d0e3df26918da0a13749740a1f6777574 Signed-off-by: Nick Terrell Tested By: Paul Jones Tested-by: Oleksandr Natalenko Tested-by: Sedat Dilek # LLVM/Clang v13.0.0 on x86-64 Tested-by: Jean-Denis Girard --- include/linux/zstd.h | 13 +- include/linux/zstd_errors.h | 77 + include/linux/zstd_lib.h | 3367 +++++++++++++++++++++++++++++-------------- 3 files changed, 2408 insertions(+), 1049 deletions(-) create mode 100644 include/linux/zstd_errors.h (limited to 'include/linux') diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 9fbc7729b0a0..113408eef6ec 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ /* * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. @@ -22,6 +22,7 @@ /* ====== Dependency ====== */ #include <linux/types.h> +#include <linux/zstd_errors.h> #include <linux/zstd_lib.h> /* ====== Helper Functions ====== */ @@ -417,12 +418,18 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); /** * struct zstd_frame_params - zstd frame parameters stored in the frame header - * @frameContentSize: The frame content size, or 0 if not present. + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not + * present. * @windowSize: The window size, or 0 if the frame is a skippable frame. + * @blockSizeMax: The maximum block size. + * @frameType: The frame type (zstd or skippable) + * @headerSize: The size of the frame header. * @dictID: The dictionary id, or 0 if not present. * @checksumFlag: Whether a checksum was used. + * + * See zstd_lib.h. */ -typedef ZSTD_frameParams zstd_frame_header; +typedef ZSTD_frameHeader zstd_frame_header; /** * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h new file mode 100644 index 000000000000..58b6dd45a969 --- /dev/null +++ b/include/linux/zstd_errors.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses.
+ */ + +#ifndef ZSTD_ERRORS_H_398273423 +#define ZSTD_ERRORS_H_398273423 + + +/*===== dependency =====*/ +#include <linux/types.h> /* size_t */ + + +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#define ZSTDERRORLIB_VISIBILITY +#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY + +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. + * dynamic linking is not yet officially supported. + * note 2 : Prefer relying on the enum rather than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version. + **********************************************/ +typedef enum { + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ +} ZSTD_ErrorCode; + +/*! ZSTD_getErrorCode() : + convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, + which can be used to compare with enum list published above */ +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + + +#endif /* ZSTD_ERRORS_H_398273423 */ diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index 13151c34f725..b8c7dbf98390 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -2,1156 +2,2431 @@ * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of https://github.com/facebook/zstd. - * An additional grant of patent rights can be found in the PATENTS file in the - * same directory. - * - * This program is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License version 2 as published by the - * Free Software Foundation. This program is dual-licensed; you may select - * either version 2 of the GNU General Public License ("GPL") or BSD license - * ("BSD").
+ * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. */ -#ifndef ZSTD_H -#define ZSTD_H +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 /* ====== Dependency ======*/ +#include <linux/limits.h> /* INT_MAX */ #include <linux/types.h> /* size_t */ -/*-***************************************************************************** - * Introduction - * - * zstd, short for Zstandard, is a fast lossless compression algorithm, - * targeting real-time compression scenarios at zlib-level and better - * compression ratios. The zstd compression library provides in-memory - * compression and decompression functions. The library supports compression - * levels from 1 up to ZSTD_maxCLevel() which is 22. Levels >= 20, labeled - * ultra, should be used with caution, as they require more memory. - * Compression can be done in: - * - a single step, reusing a context (described as Explicit memory management) - * - unbounded multiple steps (described as Streaming compression) - * The compression ratio achievable on small data can be highly improved using - * compression with a dictionary in: - * - a single step (described as Simple dictionary API) - * - a single step, reusing a dictionary (described as Fast dictionary API) - ******************************************************************************/ +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#define ZSTDLIB_VISIBILITY +#define ZSTDLIB_API ZSTDLIB_VISIBILITY + + +/* ***************************************************************************** Introduction zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. + The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. +*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 10 +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + +/*!
ZSTD_versionNumber() : + * Return runtime library version, the value is (MAJOR*100*100 + MINOR*100 + RELEASE). */ +ZSTDLIB_API unsigned ZSTD_versionNumber(void); + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) + +/*! ZSTD_versionString() : + * Return runtime library version, like "1.4.5". Requires v1.3.0+. */ +ZSTDLIB_API const char* ZSTD_versionString(void); + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed to be read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX) + + +/* ************************************* +* Simple API +***************************************/ + +/*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity`), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). + * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx(), ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + /*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/* ************************************* +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. + * Important : in order to behave similarly to `ZSTD_compress()`, + * this function compresses at requested compression level, + * __ignoring any other parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. 
Only `compressionLevel` remains. + */ +ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, + * and re-use it for each successive decompression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer */ + +/*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. + * Compatible with sticky parameters. + */ +ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); -/** - * enum ZSTD_ErrorCode - zstd error codes + +/* ************************************* +* Advanced compression API +***************************************/ + +/* API design : + * Parameters are pushed one by one into an existing context, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! + * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). * - * Functions that return size_t can be checked for errors using ZSTD_isError() - * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode(). + * This API supersedes all other "advanced" API entry points in the experimental section. + * In the future, we expect to remove, from the experimental API, entry points which are redundant with this API. + */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. + Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2.
+ * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. + * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. + * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. + * Note: will be enabled by default if ZSTD_c_windowLog >= 128 MB and + * compression strategy >= ZSTD_btopt (== compression level 16+) */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. 
+ * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". */ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only active if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * Otherwise, trying to set any other value than default (0) will be a no-op and return an error. + * In a situation where it's unknown if the linked library supports multi-threading or not, + * setting ZSTD_c_nbWorkers to any value >= 1 and consulting the return value provides a quick way to check this property. + */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. + * When nbWorkers >= 1, triggers asynchronous mode when invoking ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression is performed in parallel, within worker thread(s). + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, + * compression is performed inside Caller's thread, and all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. 
+ * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, + ZSTD_c_experimentalParam10=1007, + ZSTD_c_experimentalParam11=1008, + ZSTD_c_experimentalParam12=1009 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). + * Exception : when using multi-threading mode (nbWorkers >= 1), + * the following parameters can be updated _during_ compression (within same frame): + * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * new parameters will be active for next job only (after a flush()). + * @return : an error code (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag. 
+ * This value will also be controlled at end of frame, and trigger an error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame. + * Note 2 : pledgedSrcSize is only valid once, for the next frame. + * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN. + * Note 3 : Whenever all input data is provided and consumed in a single round, + * for example with ZSTD_compress2(), + * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end), + * this value is automatically overridden by srcSize instead. + */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + typedef enum { - ZSTD_error_no_error, - ZSTD_error_GENERIC, - ZSTD_error_prefix_unknown, - ZSTD_error_version_unsupported, - ZSTD_error_parameter_unknown, - ZSTD_error_frameParameter_unsupported, - ZSTD_error_frameParameter_unsupportedBy32bits, - ZSTD_error_frameParameter_windowTooLarge, - ZSTD_error_compressionParameter_unsupported, - ZSTD_error_init_missing, - ZSTD_error_memory_allocation, - ZSTD_error_stage_wrong, - ZSTD_error_dstSize_tooSmall, - ZSTD_error_srcSize_wrong, - ZSTD_error_corruption_detected, - ZSTD_error_checksum_wrong, - ZSTD_error_tableLog_tooLarge, - ZSTD_error_maxSymbolValue_tooLarge, - ZSTD_error_maxSymbolValue_tooSmall, - ZSTD_error_dictionary_corrupted, - ZSTD_error_dictionary_wrong, - ZSTD_error_dictionaryCreation_failed, - ZSTD_error_maxCode -} ZSTD_ErrorCode; - -/** - * ZSTD_maxCLevel() - maximum compression level available - * - * Return: Maximum compression level available. - */ -int ZSTD_maxCLevel(void); -/** - * ZSTD_compressBound() - maximum compressed size in worst case scenario - * @srcSize: The size of the data to compress. - * - * Return: The maximum compressed size in the worst case scenario. - */ -size_t ZSTD_compressBound(size_t srcSize); -/** - * ZSTD_isError() - tells if a size_t function result is an error code - * @code: The function result to check for error. - * - * Return: Non-zero iff the code is an error. - */ -static __attribute__((unused)) unsigned int ZSTD_isError(size_t code) -{ - return code > (size_t)-ZSTD_error_maxCode; -} -/** - * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode - * @functionResult: The result of a function for which ZSTD_isError() is true. - * - * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0 - * if the functionResult isn't an error. - */ -static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode( - size_t functionResult) -{ - if (!ZSTD_isError(functionResult)) - return (ZSTD_ErrorCode)0; - return (ZSTD_ErrorCode)(0 - functionResult); -} - -/** - * enum ZSTD_strategy - zstd compression search strategy - * - * From faster to stronger. + ZSTD_reset_session_only = 1, + ZSTD_reset_parameters = 2, + ZSTD_reset_session_and_parameters = 3 +} ZSTD_ResetDirective; + +/*! ZSTD_CCtx_reset() : + * There are 2 different things that can be reset, independently or jointly : + * - The session : will stop compressing current frame, and make CCtx ready to start a new one. + * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. 
+ * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/* ************************************* +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). + * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + typedef enum { - ZSTD_fast, - ZSTD_dfast, - ZSTD_greedy, - ZSTD_lazy, - ZSTD_lazy2, - ZSTD_btlazy2, - ZSTD_btopt, - ZSTD_btopt2 -} ZSTD_strategy; -/** - * struct ZSTD_compressionParameters - zstd compression parameters - * @windowLog: Log of the largest match distance. Larger means more - * compression, and more memory needed during decompression. - * @chainLog: Fully searched segment. Larger means more compression, slower, - * and more memory (useless for fast). - * @hashLog: Dispatch table. Larger means more compression, - * slower, and more memory. - * @searchLog: Number of searches. Larger means more compression and slower. - * @searchLength: Match length searched. Larger means faster decompression, - * sometimes less compression. - * @targetLength: Acceptable match size for optimal parser (only). Larger means - * more compression, and slower. - * @strategy: The zstd compression strategy. + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. 
+	 * At the time of this writing, they include :
+	 * ZSTD_d_format
+	 * ZSTD_d_stableOutBuffer
+	 * ZSTD_d_forceIgnoreChecksum
+	 * ZSTD_d_refMultipleDDicts
+	 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+	 * note : never ever use experimentalParam? names directly
+	 */
+	ZSTD_d_experimentalParam1=1000,
+	ZSTD_d_experimentalParam2=1001,
+	ZSTD_d_experimentalParam3=1002,
+	ZSTD_d_experimentalParam4=1003
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ *  All parameters must belong to an interval with lower and upper bounds,
+ *  otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ *         - an error status field, which must be tested using ZSTD_isError()
+ *         - both lower and upper bounds, inclusive
  */
-typedef struct {
-	unsigned int windowLog;
-	unsigned int chainLog;
-	unsigned int hashLog;
-	unsigned int searchLog;
-	unsigned int searchLength;
-	unsigned int targetLength;
-	ZSTD_strategy strategy;
-} ZSTD_compressionParameters;
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ *  Set one decompression parameter, selected by enum ZSTD_dParameter.
+ *  All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ *  Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ *  Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ *  Return a DCtx to clean state.
+ *  Session and parameters can be reset jointly or separately.
+ *  Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
+
+
+/* **************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /*< start of input buffer */
+  size_t size;        /*< size of input buffer */
+  size_t pos;         /*< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /*< start of output buffer */
+  size_t size;        /*< size of output buffer */
+  size_t pos;         /*< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
-/**
- * struct ZSTD_frameParameters - zstd frame parameters
- * @contentSizeFlag: Controls whether content size will be present in the frame
- *                   header (when known).
- * @checksumFlag:    Controls whether a 32-bit checksum is generated at the end
- *                   of the frame for error detection.
- * @noDictIDFlag:    Controls whether dictID will be saved into the frame header
- *                   when using dictionary compression.
+
+/*-***********************************************************************
+*  Streaming compression - HowTo
+*
+*  A ZSTD_CStream object is required to track streaming operation.
+*  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+*  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+*  It is recommended to re-use a ZSTD_CStream, since it plays nicer with the system's memory, by re-using already allocated memory.
+*
+*  For parallel execution, use one separate ZSTD_CStream per thread.
+*
+*  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+*  Parameters are sticky : when starting a new compression on the same context,
+*  it will re-use the same sticky parameters as previous compression session.
+*  When in doubt, it's recommended to fully initialize the context before usage.
+*  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+*  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+*  set more specific parameters, the pledged source size, or load a dictionary.
+*
+*  Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+*  consume input stream. The function will automatically update both `pos`
+*  fields within `input` and `output`.
+*  Note that the function may not consume the entire input, for example, because
+*  the output buffer is already full, in which case `input.pos < input.size`.
+*  The caller must check if input has been entirely consumed.
+*  If not, the caller must make some room to receive more compressed data,
+*  and then present again remaining input data.
+*  note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+*        but doesn't guarantee maximal forward progress. This is especially relevant
+*        when compressing with multiple threads. The call won't block if it can
+*        consume some input, but if it can't it will wait for some, but not all,
+*        output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+*           or an error code, which can be tested using ZSTD_isError().
+*
+*  At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+*  using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+*  Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+*  In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+*  operation.
+*  note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+*           >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*           or an error code, which can be tested using ZSTD_isError().
+*
+*  Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+*  It will perform a flush and write frame epilogue.
+*  The epilogue is required for decoders to consider a frame completed.
+*  The flush operation is the same, and follows the same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+*  You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+*  start a new frame.
+*  note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+*        block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+*           >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*           or an error code, which can be tested using ZSTD_isError().
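Putting the three directives together, a minimal sketch of the loop described above (illustrative userspace-style code, not part of the patch; draining the output buffer between calls is left to the caller):

    #include <zstd.h>

    /* Sketch: feed 'src' as one frame; drive ZSTD_e_end until it returns 0. */
    static size_t stream_one_frame(ZSTD_CCtx *cctx, ZSTD_outBuffer *output,
                                   const void *src, size_t srcSize)
    {
        ZSTD_inBuffer input = { src, srcSize, 0 };
        size_t remaining;
        do {
            ZSTD_EndDirective const mode =
                (input.pos < input.size) ? ZSTD_e_continue : ZSTD_e_end;
            remaining = ZSTD_compressStream2(cctx, output, &input, mode);
            if (ZSTD_isError(remaining))
                return remaining;
            /* A real caller drains 'output' here whenever it fills up. */
        } while (input.pos < input.size || remaining != 0);
        return 0;
    }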
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /*< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);  /* accept NULL pointer */
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream(), with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - endOp must be a valid directive
+ *  - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ *  - When nbWorkers>=1, function is non-blocking : it copies a portion of input, distributes jobs to internal worker threads, flushes to output whatever is available,
+ *                                                  and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *                                                  The function nonetheless guarantees forward progress : it will return only after it reads or writes at least one byte.
+ *  - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                         ZSTD_outBuffer* output,
+                                         ZSTD_inBuffer* input,
+                                         ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interfaces to an absolute minimum.
+ * It's not rare that performance ends up being spent more on the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the number of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /*< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output buffer. Guaranteed to successfully flush at least one complete compressed block. */
+
+
+/* *****************************************************************************
+ * The following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
  *
- * The default value is all fields set to 0.
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+*  Alternatively, use advanced API to set specific properties.
+*
+*  Use ZSTD_decompressStream() repetitively to consume your input.
+*  The function will update both `pos` fields.
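For illustration, that consumption loop might look like this (a hedged userspace-style sketch, not part of the patch; draining the output buffer remains the caller's job):

    #include <zstd.h>

    /* Sketch: decode one frame; ZSTD_decompressStream() returns 0 at frame end. */
    static size_t stream_decode_frame(ZSTD_DCtx *dctx,
                                      ZSTD_outBuffer *output, ZSTD_inBuffer *input)
    {
        while (input->pos < input->size) {
            size_t const hint = ZSTD_decompressStream(dctx, output, input);
            if (ZSTD_isError(hint))
                return hint;
            if (hint == 0)
                break;  /* frame completely decoded and fully flushed */
            /* 'hint' suggests the next input size; drain 'output' if full. */
        }
        return 0;
    }

The notes below spell out the exact return-value semantics this loop relies on.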
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  The function tries to flush all data decoded immediately, respecting output buffer size.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  But if `output.pos == output.size`, there might be some data left within internal buffers.
+*  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+*  Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+*           or an error code, which can be tested using ZSTD_isError(),
+*           or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+*                                   the return value is a suggested next input size (just a hint for better latency)
+*                                   that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /*< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer */
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guaranteed to successfully flush at least one complete block in all circumstances. */
+
+
+/* ************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression at an explicit compression level using a Dictionary.
+ *  A dictionary can be any arbitrary data segment (also called a prefix),
+ *  or a buffer with specified information (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                           const void* src, size_t srcSize,
+                                           const void* dict,size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a known Dictionary.
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict,size_t dictSize);
+
+
+/* *********************************
+ *  Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
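Before the individual entry points below, a sketch of the intended usage pattern (hypothetical userspace-style code, not part of the patch; ZSTD_createCCtx()/ZSTD_freeCCtx() come from the simple API earlier in this header, and NULL checks are elided):

    #include <zstd.h>

    /* Sketch: digest the dictionary once, then reuse it for many inputs.
     * A real caller would cache 'cdict' across calls; it is freed here
     * only to keep the sketch self-contained. */
    static size_t compress_with_cdict(const void *dictBuf, size_t dictSize,
                                      void *dst, size_t dstCapacity,
                                      const void *src, size_t srcSize)
    {
        ZSTD_CCtx  *const cctx  = ZSTD_createCCtx();
        ZSTD_CDict *const cdict = ZSTD_createCDict(dictBuf, dictSize, 3 /* level */);
        size_t const ret = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
                                                    src, srcSize, cdict);
        ZSTD_freeCDict(cdict);
        ZSTD_freeCCtx(cctx);
        return ret;
    }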
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages or blocks using the same dictionary,
+ *  it's recommended to digest the dictionary only once, since it's a costly operation.
+ *  ZSTD_createCDict() will create a state from digesting a dictionary.
+ *  The resulting state can be used for future compression operations with very limited startup cost.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ *  Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ *  Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ *           in which case the only thing that it transports is the @compressionLevel.
+ *           This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ *           expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict().
+ *  If a NULL pointer is passed, no operation is performed. */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times.
+ *  Note : compression level is _decided at dictionary creation time_,
+ *         and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                            const void* src, size_t srcSize,
+                                            const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict()
+ *  If a NULL pointer is passed, no operation is performed. */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const ZSTD_DDict* ddict);
+
+
+/* ******************************
+ *  Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  If @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/* *****************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 2 : Loading a dictionary involves building tables.
+ *           It's also a CPU-consuming operation, with non-negligible impact on latency.
+ *           Tables are dependent on compression parameters, and for this reason,
+ *           compression parameters can no longer be changed after loading a dictionary.
+ *  Note 3 : `dict` content will be copied internally.
+ *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supersede any compression parameter previously set within CCtx.
+ *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ *  The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ *  The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compressed frame.
+ *  A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ *  Decompression will need the same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ *           Its content must remain unmodified during compression.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_c_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU-consuming operation, with non-negligible impact on latency.
+ *           If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ *  Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ *           Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                                       const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *           It's recommended to "load once, use many times", to amortize the cost.
+ *  Note 2 : `dict` content will be copied internally, so `dict` can be released after loading.
+ *           Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ *           how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ *
+ *  If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function
+ *  will store the DDict references in a table, and the DDict used for decompression
+ *  will be determined at decompression time, as per the dict ID in the frame.
+ *  The memory for the table is allocated on the first call to refDDict, and can be
+ *  freed with ZSTD_freeDCtx().
+ *
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special: referencing a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
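The pair of referencing calls can be exercised as follows (an illustrative userspace-style sketch, not part of the patch; dictionary and context lifetimes are the caller's responsibility, as the notes above require):

    #include <zstd.h>

    /* Sketch: compress with a referenced CDict, decode with the matching DDict. */
    static size_t dict_roundtrip(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
                                 ZSTD_DCtx *dctx, const ZSTD_DDict *ddict,
                                 void *comp, size_t compCap,
                                 void *plain, size_t plainCap,
                                 const void *src, size_t srcSize)
    {
        size_t csize;
        ZSTD_CCtx_refCDict(cctx, cdict);          /* sticky for future frames */
        csize = ZSTD_compress2(cctx, comp, compCap, src, srcSize);
        if (ZSTD_isError(csize))
            return csize;
        ZSTD_DCtx_refDDict(dctx, ddict);          /* sticky for future frames */
        return ZSTD_decompressDCtx(dctx, plain, plainCap, comp, csize);
    }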
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) to decompress next frame.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
+ *  Prefix is **only used once**. Reference is discarded at end of frame.
+ *  End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ *  Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ *           Prefix buffer must remain unmodified up to the end of frame,
+ *           reached when ZSTD_decompressStream() returns 0.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ *  Note 4 : Referencing a raw content prefix has almost no CPU or memory cost.
+ *           A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                                       const void* prefix, size_t prefixSize);
+
+/* ===   Memory management   === */
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the _current_ memory usage of selected object.
+ *  Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif  /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ *   ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/* **************************************************************************************
+ *   experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1)   /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format)    ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE    8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32    30
+#define ZSTD_WINDOWLOG_MAX_64    31
+#define ZSTD_WINDOWLOG_MAX     ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN       10
+#define ZSTD_HASHLOG_MAX       ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN          6
+#define ZSTD_CHAINLOG_MAX_32     29
+#define ZSTD_CHAINLOG_MAX_64     30
+#define ZSTD_CHAINLOG_MAX      ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN        ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX      (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN        1
+#define ZSTD_MINMATCH_MAX         7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN         3   /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX    ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN        ZSTD_fast
+#define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN       0
+#define ZSTD_OVERLAPLOG_MAX       9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27   /* by default, the streaming decoder will refuse any frame
+                                           * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+                                           * to preserve host's memory from unreasonable requirements. */
+
+typedef struct {
+    unsigned int offset;      /* The offset of the match. (NOT the same as the offset code) */
+    unsigned int litLength;   /* Literal length of the sequence. */
+    unsigned int matchLength; /* Match length of the sequence. */
+    unsigned int rep;         /* Represents which repeat offset is represented. Ranges from [0, 3].
+                               * If rep == 0, then 'offset' does not contain a repeat offset.
+                               * If rep > 0:
+                               *  If litLength != 0:
+                               *      rep == 1 --> offset == repeat_offset_1
+                               *      rep == 2 --> offset == repeat_offset_2
+                               *      rep == 3 --> offset == repeat_offset_3
+                               *  If litLength == 0:
+                               *      rep == 1 --> offset == repeat_offset_2
+                               *      rep == 2 --> offset == repeat_offset_3
+                               *      rep == 3 --> offset == repeat_offset_1 - 1
+                               *
+                               * Note: This field is optional. ZSTD_generateSequences() will calculate the value of
+                               * 'rep', but repeat offsets do not necessarily need to be calculated from an external
+                               * sequence provider's perspective. For example, ZSTD_compressSequences() does not
+                               * use this 'rep' field at all (as of now).
+                               */
+} ZSTD_Sequence;
+
+typedef struct {
+    unsigned windowLog;       /*< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;        /*< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;         /*< dispatch table : larger == faster, more memory */
+    unsigned searchLog;       /*< nb of searches : larger == more compression, slower */
+    unsigned minMatch;        /*< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;    /*< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;   /*< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+    int contentSizeFlag; /*< 1: content size will be in frame header (when known) */
+    int checksumFlag;    /*< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+    int noDictIDFlag;    /*< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
 } ZSTD_frameParameters;
-/**
- * struct ZSTD_parameters - zstd parameters
- * @cParams: The compression parameters.
- * @fParams: The frame parameters.
- */ typedef struct { - ZSTD_compressionParameters cParams; - ZSTD_frameParameters fParams; + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; } ZSTD_parameters; -/** - * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level - * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). - * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. - * @dictSize: The dictionary size or 0 if a dictionary isn't being used. - * - * Return: The selected ZSTD_compressionParameters. +typedef enum { + ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */ + ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */ + ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */ +} ZSTD_dictContentType_e; + +typedef enum { + ZSTD_dlm_byCopy = 0, /*< Copy dictionary content internally */ + ZSTD_dlm_byRef = 1 /*< Reference dictionary content -- the dictionary buffer must outlive its users. */ +} ZSTD_dictLoadMethod_e; + +typedef enum { + ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */ + ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number. + * Useful to save 4 bytes per generated frame. + * Decoder cannot recognise automatically this format, requiring this instruction. */ +} ZSTD_format_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_forceIgnoreChecksum */ + ZSTD_d_validateChecksum = 0, + ZSTD_d_ignoreChecksum = 1 +} ZSTD_forceIgnoreChecksum_e; + +typedef enum { + /* Note: this enum controls ZSTD_d_refMultipleDDicts */ + ZSTD_rmd_refSingleDDict = 0, + ZSTD_rmd_refMultipleDDicts = 1 +} ZSTD_refMultipleDDicts_e; + +typedef enum { + /* Note: this enum and the behavior it controls are effectively internal + * implementation details of the compressor. They are expected to continue + * to evolve and should be considered only in the context of extremely + * advanced performance tuning. + * + * Zstd currently supports the use of a CDict in three ways: + * + * - The contents of the CDict can be copied into the working context. This + * means that the compression can search both the dictionary and input + * while operating on a single set of internal tables. This makes + * the compression faster per-byte of input. However, the initial copy of + * the CDict's tables incurs a fixed cost at the beginning of the + * compression. For small compressions (< 8 KB), that copy can dominate + * the cost of the compression. + * + * - The CDict's tables can be used in-place. In this model, compression is + * slower per input byte, because the compressor has to search two sets of + * tables. However, this model incurs no start-up cost (as long as the + * working context's tables can be reused). For small inputs, this can be + * faster than copying the CDict's tables. + * + * - The CDict's tables are not used at all, and instead we use the working + * context alone to reload the dictionary and use params based on the source + * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict(). + * This method is effective when the dictionary sizes are very small relative + * to the input size, and the input size is fairly large to begin with. 
+ * + * Zstd has a simple internal heuristic that selects which strategy to use + * at the beginning of a compression. However, if experimentation shows that + * Zstd is making poor choices, it is possible to override that choice with + * this enum. + */ + ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */ + ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */ + ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */ + ZSTD_dictForceLoad = 3 /* Always reload the dictionary */ +} ZSTD_dictAttachPref_e; + +typedef enum { + ZSTD_lcm_auto = 0, /*< Automatically determine the compression mode based on the compression level. + * Negative compression levels will be uncompressed, and positive compression + * levels will be compressed. */ + ZSTD_lcm_huffman = 1, /*< Always attempt Huffman compression. Uncompressed literals will still be + * emitted if Huffman compression is not profitable. */ + ZSTD_lcm_uncompressed = 2 /*< Always emit uncompressed literals. */ +} ZSTD_literalCompressionMode_e; + + +/* ************************************* +* Frame size functions +***************************************/ + +/*! ZSTD_findDecompressedSize() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - decompressed size of all data in all successive frames + * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * note 2 : decompressed size is always present when compression is done with ZSTD_compress() + * note 3 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure result fits within application's authorized limits. + * Each application can set its own limits. + * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to + * read each contained frame header. This is fast as most of the data is skipped, + * however it does mean that all frame data must be present and valid. */ +ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_decompressBound() : + * `src` should point to the start of a series of ZSTD encoded and/or skippable frames + * `srcSize` must be the _exact_ size of this series + * (i.e. there should be a frame boundary at `src + srcSize`) + * @return : - upper-bound for the decompressed size of all data in all successive frames + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR + * + * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. + * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. + * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value. 
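As a usage sketch of the bound just described (illustrative userspace-style code under the static-linking-only section above, not part of the patch; the hypothetical helper sizes one output buffer from the frame headers):

    #include <stdint.h>
    #include <stdlib.h>
    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: allocate a destination buffer sized from the frame headers. */
    static void *alloc_for_frames(const void *src, size_t srcSize, size_t *capOut)
    {
        unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
        if (bound == ZSTD_CONTENTSIZE_ERROR || bound > (unsigned long long)SIZE_MAX)
            return NULL;   /* malformed input, or too large for one allocation */
        *capOut = (size_t)bound;
        return malloc(*capOut);
    }

The remaining note on how the per-frame upper bound is computed follows.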
+ *  note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *           upper-bound = # blocks * min(128 KB, Window_Size) */
-ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
-/**
- * ZSTD_getParams() - returns ZSTD_parameters for selected level
- * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel().
- * @estimatedSrcSize: The estimated source size to compress or 0 if unknown.
- * @dictSize:         The dictionary size or 0 if a dictionary isn't being used.
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+typedef enum {
+  ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+  ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
+} ZSTD_sequenceFormat_e;
+
+/*! ZSTD_generateSequences() :
+ *  Generate sequences using ZSTD_compress2(), given a source buffer.
+ *
+ *  Each block will end with a dummy sequence
+ *  with offset == 0, matchLength == 0, and litLength == length of last literals.
+ *  litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
+ *  simply acts as a block delimiter.
  *
- * The same as ZSTD_getCParams() except also selects the default frame
- * parameters (all zero).
+ *  zc can be used to insert custom compression params.
+ *  This function invokes ZSTD_compress2().
  *
- * Return: The selected ZSTD_parameters.
+ *  The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ *  setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+ * @return : number of sequences generated
  */
-ZSTD_parameters ZSTD_getParams(int compressionLevel,
-	unsigned long long estimatedSrcSize, size_t dictSize);
-/*-*************************************
- * Explicit memory management
- **************************************/
+ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                                          size_t outSeqsSize, const void* src, size_t srcSize);
-/**
- * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx
- * @cParams: The compression parameters to be used for compression.
+/*! ZSTD_mergeBlockDelimiters() :
+ *  Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
+ *  by merging them into the literals of the next sequence.
  *
- * If multiple compression parameters might be used, the caller must call
- * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum
- * size.
+ *  As such, the final generated result has no explicit representation of block boundaries,
+ *  and the final last literals segment is not represented in the sequences.
  *
- * Return: A lower bound on the size of the workspace that is passed to
- *         ZSTD_initCCtx().
+ *  The output of this function can be fed into ZSTD_compressSequences() with CCtx
+ *  setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters
+ * @return : number of sequences left after merging
+ */
+ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
+
+/*! ZSTD_compressSequences() :
+ *  Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
+ *  If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
+ *  The entire source is compressed into a single frame.
+ *
+ *  The compression behavior changes based on cctx params. In particular:
+ *    If ZSTD_c_blockDelimiters == ZSTD_sf_noBlockDelimiters, the array of ZSTD_Sequence is expected to contain
+ *    no block delimiters (defined in ZSTD_Sequence). Block boundaries are roughly determined based on
+ *    the block size derived from the cctx, and sequences may be split. This is the default setting.
+ *
+ *    If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain
+ *    block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided.
+ *
+ *    If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined
+ *    behavior. If ZSTD_c_validateSequences == 1, then if a sequence is invalid (see doc/zstd_compression_format.md for
+ *    specifics regarding offset/matchlength requirements) then the function will bail out and return an error.
+ *
+ *    In addition to the two adjustable experimental params, there are other important cctx params.
+ *    - ZSTD_c_minMatch MUST be less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN.
+ *    - ZSTD_c_compressionLevel accordingly adjusts the strength of the entropy coder, as it would in typical compression.
+ *    - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset
+ *      is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md
+ *
+ *  Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
+ *  Note 2: Once we integrate the ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
+ *          and cannot emit an RLE block that disagrees with the repcode history.
+ * @return : final compressed size or a ZSTD error.
+ */
+ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
+                                          const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                                          const void* src, size_t srcSize);
+
+
+/*! ZSTD_writeSkippableFrame() :
+ *  Generates a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ *
+ *  Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number,
+ *  ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15.
+ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so + * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. * - * Return: A compression context emplaced into workspace. - */ -ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize); - -/** - * ZSTD_compressCCtx() - compress src into dst - * @ctx: The context. Must have been initialized with a workspace at - * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). - * @dst: The buffer to compress src into. - * @dstCapacity: The size of the destination buffer. May be any size, but - * ZSTD_compressBound(srcSize) is guaranteed to be large enough. - * @src: The data to compress. - * @srcSize: The size of the data to compress. - * @params: The parameters to use for compression. See ZSTD_getParams(). + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). * - * Return: The compressed size or an error, which can be checked using - * ZSTD_isError(). + * @return : number of bytes written or a ZSTD error. */ -size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, ZSTD_parameters params); - -/** - * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx - * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initDCtx(). +ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant); + + +/* ************************************* +* Memory management +***************************************/ + +/*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough + * for any compression level up to selected one. + * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate + * does not include space for a window buffer. + * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * + * When srcSize can be bound by a known and rather "small" value, + * this fact can be used to provide a tighter estimation + * because the CCtx compression context will need less memory. + * This tighter estimation can be provided by more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * + * Note 2 : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); + +/*! ZSTD_estimateCStreamSize() : + * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. 
+ *  It will also consider src size to be arbitrarily "large", which is the worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, whose additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-bytes aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
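To illustrate the estimate/initStatic pairing (a hedged userspace-style sketch, not part of the patch; the workspace comes from malloc purely for brevity, any sufficiently aligned memory works):

    #include <stdlib.h>
    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: build a CCtx inside an externally owned workspace.
     * There is no matching free function: release the workspace itself. */
    static ZSTD_CCtx *make_static_cctx(int level, void **wkspOut)
    {
        size_t const need = ZSTD_estimateCCtxSize(level);
        void *const wksp = malloc(need);    /* malloc is suitably aligned */
        ZSTD_CCtx *cctx = wksp ? ZSTD_initStaticCCtx(wksp, need) : NULL;
        if (!cctx) {
            free(wksp);
            return NULL;
        }
        *wkspOut = wksp;  /* caller frees this after the last use of cctx */
        return cctx;
    }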
 */
-size_t ZSTD_DCtxWorkspaceBound(void);
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /*< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /*< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static
+__attribute__((__unused__))
+ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /*< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+/*! Thread pool :
+ *  These prototypes make it possible to share a thread pool among multiple compression contexts.
+ *  This can limit resources for applications with multiple threads where each one uses
+ *  a threaded compression mode (via ZSTD_c_nbWorkers parameter).
+ *  ZSTD_createThreadPool creates a new thread pool with a given number of threads.
+ *  Note that such a pool must remain valid for as long as it is being used.
+ *  ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value
+ *  to use an internal thread pool).
+ *  ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer.
+ */
+typedef struct POOL_ctx_s ZSTD_threadPool;
+ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads);
+ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool);  /* accept NULL pointer */
+ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool);
-/**
- * struct ZSTD_DCtx - the zstd decompression context
- *
- * When decompressing many times it is recommended to allocate a context just
- * once and reuse it for each successive decompression operation.
+
+/*
+ * This API is temporary and is expected to change or disappear in the future!
*/ -typedef struct ZSTD_DCtx_s ZSTD_DCtx; -/** - * ZSTD_initDCtx() - initialize a zstd decompression context - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to - * determine how large the workspace must be. - * - * Return: A decompression context emplaced into workspace. - */ -ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize); - -/** - * ZSTD_decompressDCtx() - decompress zstd compressed src into dst - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); - -/*-************************ - * Simple dictionary API - **************************/ - -/** - * ZSTD_compress_usingDict() - compress src into dst using a dictionary - * @ctx: The context. Must have been initialized with a workspace at - * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). - * @dst: The buffer to compress src into. - * @dstCapacity: The size of the destination buffer. May be any size, but - * ZSTD_compressBound(srcSize) is guaranteed to be large enough. - * @src: The data to compress. - * @srcSize: The size of the data to compress. - * @dict: The dictionary to use for compression. - * @dictSize: The size of the dictionary. - * @params: The parameters to use for compression. See ZSTD_getParams(). - * - * Compression using a predefined dictionary. The same dictionary must be used - * during decompression. - * - * Return: The compressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, const void *dict, size_t dictSize, - ZSTD_parameters params); - -/** - * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * @dict: The dictionary to use for decompression. The same dictionary - * must've been used to compress the data. - * @dictSize: The size of the dictionary. - * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize, const void *dict, size_t dictSize); - -/*-************************** - * Fast dictionary API - ***************************/ - -/** - * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict - * @cParams: The compression parameters to be used for compression. 
- * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initCDict(). - */ -size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams); - -/** - * struct ZSTD_CDict - a digested dictionary to be used for compression +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + const ZSTD_CCtx_params* cctxParams, + ZSTD_customMem customMem); + +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + + +/* ************************************* +* Advanced compression functions +***************************************/ + +/*! ZSTD_createCDict_byReference() : + * Create a digested dictionary for compression + * Dictionary content is just referenced, not duplicated. + * As a consequence, `dictBuffer` **must** outlive CDict, + * and its content must remain unmodified throughout the lifetime of CDict. + * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ +ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); + +/*! ZSTD_getDictID_fromCDict() : + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getCParams() : + * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. + * `estimatedSrcSize` value is optional, select 0 if not known */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_getParams() : + * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. + * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ +ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); + +/*! ZSTD_checkCParams() : + * Ensure param values remain within authorized range. + * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ +ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + +/*! ZSTD_adjustCParams() : + * optimize params for a given `srcSize` and `dictSize`. + * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN. + * `dictSize` must be `0` when there is no dictionary. + * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. + * This function never fails (wide contract) */ +ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + +/*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. 
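+ * For instance (editor's sketch of the suggested replacement; the level and checksum values are arbitrary) :
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);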
+ * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ +ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params); + +/*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now REDUNDANT. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will be marked as deprecated and generate compilation warning in some future version */ +ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams); + + +/*! ZSTD_CCtx_loadDictionary_byReference() : + * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. + * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + +/*! ZSTD_CCtx_loadDictionary_advanced() : + * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over + * how to load the dictionary (by copy ? by reference ?) + * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); + +/*! ZSTD_CCtx_refPrefix_advanced() : + * Same as ZSTD_CCtx_refPrefix(), but gives finer control over + * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */ +ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); + +/* === experimental parameters === */ +/* these parameters can be used with ZSTD_setParameter() + * they are not guaranteed to remain supported in the future */ + + /* Enables rsyncable mode, + * which makes compressed files more rsync friendly + * by adding periodic synchronization points to the compressed data. + * The target average block size is ZSTD_c_jobSize / 2. + * It's possible to modify the job size to increase or decrease + * the granularity of the synchronization point. + * Once the jobSize is smaller than the window size, + * it will result in compression ratio degradation. + * NOTE 1: rsyncable mode only works when multithreading is enabled. + * NOTE 2: rsyncable performs poorly in combination with long range mode, + * since it will decrease the effectiveness of synchronization points, + * though mileage may vary. + * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s. + * If the selected compression level is already running significantly slower, + * the overall speed won't be significantly impacted. + */ + #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1 + +/* Select a compression format. + * The value must be of type ZSTD_format_e. + * See ZSTD_format_e enum definition for details */ +#define ZSTD_c_format ZSTD_c_experimentalParam2 + +/* Force back-reference distances to remain < windowSize, + * even when referencing into Dictionary content (default:0) */ +#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3 + +/* Controls whether the contents of a CDict + * are used in place, or copied into the working context. 
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See the ZSTD_literalCompressionMode_e enum definition for details. */
-typedef struct ZSTD_CDict_s ZSTD_CDict;
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that the hint is close to the actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates it */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/* Controls whether the new and experimental "dedicated dictionary search
+ * structure" can be used. This feature is still rough around the edges, be
+ * prepared for surprising behavior!
+ *
+ * How to use it:
+ *
+ * When using a CDict, whether to use this feature or not is controlled at
+ * CDict creation, and it must be set in a CCtxParams set passed into that
+ * construction (via ZSTD_createCDict_advanced2()). A compression will then
+ * use the feature or not based on how the CDict was constructed; the value of
+ * this param, set in the CCtx, will have no effect.
+ *
+ * However, when a dictionary buffer is passed into a CCtx, such as via
+ * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control
+ * whether the CDict that is created internally can use the feature or not.
+ *
+ * What it does:
+ *
+ * Normally, the internal data structures of the CDict are analogous to what
+ * would be stored in a CCtx after compressing the contents of a dictionary.
+ * To an approximation, a compression using a dictionary can then use those
+ * data structures to simply continue what is effectively a streaming
+ * compression where the simulated compression of the dictionary left off.
+ * Which is to say, the search structures in the CDict are normally the same
+ * format as in the CCtx.
+ *
+ * It is possible to do better, since the CDict is not like a CCtx: the search
+ * structures are written once during CDict creation, and then are only read
+ * after that, while the search structures in the CCtx are both read and
+ * written as the compression goes along. This means we can choose a search
+ * structure for the dictionary that is read-optimized.
+ *
+ * This feature enables the use of that different structure.
+ *
+ * Note that some of the members of the ZSTD_compressionParameters struct have
+ * different semantics and constraints in the dedicated search structure. It is
+ * highly recommended that you simply set a compression level in the CCtxParams
+ * you pass into the CDict creation call, and avoid messing with the cParams
+ * directly.
+ *
+ * Effects:
+ *
+ * This will only have any effect when the selected ZSTD_strategy
+ * implementation supports this feature. Currently, that's limited to
+ * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2.
+ *
+ * Note that this means that the CDict tables can no longer be copied into the
+ * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be
+ * usable. The dictionary can only be attached or reloaded.
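+ *
+ * (Editor's sketch, illustrative only) Enabling the feature when a dictionary
+ * buffer is loaded directly into a CCtx :
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableDedicatedDictSearch, 1);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 7);  // hypothetical level; its strategy must be greedy/lazy/lazy2
+ *     ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);            // the internally created CDict may now use the feature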
+ *
+ * In general, you should expect compression to be faster--sometimes very much
+ * so--and CDict creation to be slightly slower. Eventually, we will probably
+ * make this mode the default.
+ */
+#define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8
+
+/* ZSTD_c_stableInBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the compressor, and
+ * compression will fail if it ever changes. This means the only flush
+ * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
+ * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
+ * MUST not be modified during compression or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an input window buffer,
+ * because the user guarantees it can reference the ZSTD_inBuffer until
+ * the frame is complete. But it will still allocate an output buffer
+ * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
+ * avoid the memcpy() from the input buffer to the input window buffer.
+ *
+ * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
+ * That means this flag cannot be used with ZSTD_compressStream().
+ *
+ * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, compression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
+ * not be modified during compression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * matches. Normally zstd maintains its own window buffer for this purpose,
+ * but passing this flag tells zstd to use the user-provided buffer.
+ */
+#define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
-/**
- * ZSTD_initCDict() - initialize a digested dictionary for compression
- * @dictBuffer: The dictionary to digest. The buffer is referenced by the
- * ZSTD_CDict so it must outlive the returned ZSTD_CDict.
- * @dictSize: The size of the dictionary.
- * @params: The parameters to use for compression. See ZSTD_getParams().
- * @workspace: The workspace. It must outlive the returned ZSTD_CDict.
- * @workspaceSize: The workspace size. Must be at least
- * ZSTD_CDictWorkspaceBound(params.cParams).
+/* ZSTD_c_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
 *
- * When compressing multiple messages / blocks with the same dictionary it is
- * recommended to load it just once. The ZSTD_CDict merely references the
- * dictBuffer, so it must outlive the returned ZSTD_CDict.
+ * Tells the compressor that the ZSTD_outBuffer will not be resized between
+ * calls. Specifically: (out.size - out.pos) will never grow. This gives the
+ * compressor the freedom to say: If the compressed data doesn't fit in the
+ * output buffer then return ZSTD_error_dstSizeTooSmall. This allows us to
+ * always compress directly into the output buffer, instead of compressing
+ * into an internal buffer and copying to the output buffer.
 *
- * Return: The digested dictionary emplaced into workspace.
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer.
+ * It will still allocate the
+ * input window buffer (see ZSTD_c_stableInBuffer).
+ *
+ * Zstd will check that (out.size - out.pos) never grows and return an error
+ * if it does. While not strictly necessary, this should prevent surprises.
+ */
-ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
- ZSTD_parameters params, void *workspace, size_t workspaceSize);
+#define ZSTD_c_stableOutBuffer ZSTD_c_experimentalParam10
-/**
- * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
- * @ctx: The context. Must have been initialized with a workspace at
- * least as large as ZSTD_CCtxWorkspaceBound(cParams) where
- * cParams are the compression parameters used to initialize the
- * cdict.
- * @dst: The buffer to compress src into.
- * @dstCapacity: The size of the destination buffer. May be any size, but
- * ZSTD_compressBound(srcSize) is guaranteed to be large enough.
- * @src: The data to compress.
- * @srcSize: The size of the data to compress.
- * @cdict: The digested dictionary to use for compression.
- * @params: The parameters to use for compression. See ZSTD_getParams().
+/* ZSTD_c_blockDelimiters
+ * Default is 0 == ZSTD_sf_noBlockDelimiters.
 *
- * Compression using a digested dictionary. The same dictionary must be used
- * during decompression.
+ * For use with sequence compression API: ZSTD_compressSequences().
 *
- * Return: The compressed size or an error, which can be checked using
- * ZSTD_isError().
+ * Designates whether or not the given array of ZSTD_Sequence contains block delimiters
+ * and last literals, which are defined as sequences with offset == 0 and matchLength == 0.
+ * See the definition of ZSTD_Sequence for more specifics.
+ */
-size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
- const void *src, size_t srcSize, const ZSTD_CDict *cdict);
-
+#define ZSTD_c_blockDelimiters ZSTD_c_experimentalParam11
-/**
- * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict
+/* ZSTD_c_validateSequences
+ * Default is 0 == disabled. Set to 1 to enable sequence validation.
+ *
+ * For use with sequence compression API: ZSTD_compressSequences().
+ * Designates whether or not we validate sequences provided to ZSTD_compressSequences()
+ * during function execution.
+ *
+ * Without validation, providing a sequence that does not conform to the zstd spec will cause
+ * undefined behavior, and may produce a corrupted block.
+ *
+ * With validation enabled, if a sequence is invalid (see doc/zstd_compression_format.md for
+ * specifics regarding offset/matchlength requirements) then the function will bail out and
+ * return an error.
 *
- * Return: A lower bound on the size of the workspace that is passed to
- * ZSTD_initDDict().
 */
+#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
-/**
- * struct ZSTD_DDict - a digested dictionary to be used for decompression
+/*! ZSTD_CCtx_getParameter() :
+ *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
 */
-typedef struct ZSTD_DDict_s ZSTD_DDict;
-
-/**
- * ZSTD_initDDict() - initialize a digested dictionary for decompression
- * @dictBuffer: The dictionary to digest. The buffer is referenced by the
- * ZSTD_DDict so it must outlive the returned ZSTD_DDict.
- * @dictSize: The size of the dictionary.
- * @workspace: The workspace. It must outlive the returned ZSTD_DDict.
- * @workspaceSize: The workspace size. Must be at least - * ZSTD_DDictWorkspaceBound(). - * - * When decompressing multiple messages / blocks with the same dictionary it is - * recommended to load it just once. The ZSTD_DDict merely references the - * dictBuffer, so it must outlive the returned ZSTD_DDict. - * - * Return: The digested dictionary emplaced into workspace. - */ -ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize, - void *workspace, size_t workspaceSize); - -/** - * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict - * @ctx: The decompression context. - * @dst: The buffer to decompress src into. - * @dstCapacity: The size of the destination buffer. Must be at least as large - * as the decompressed size. If the caller cannot upper bound the - * decompressed size, then it's better to use the streaming API. - * @src: The zstd compressed data to decompress. Multiple concatenated - * frames and skippable frames are allowed. - * @srcSize: The exact size of the data to decompress. - * @ddict: The digested dictionary to use for decompression. The same - * dictionary must've been used to compress the data. - * - * Return: The decompressed size or an error, which can be checked using - * ZSTD_isError(). - */ -size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, - size_t dstCapacity, const void *src, size_t srcSize, - const ZSTD_DDict *ddict); - - -/*-************************** - * Streaming - ***************************/ - -/** - * struct ZSTD_inBuffer - input buffer for streaming - * @src: Start of the input buffer. - * @size: Size of the input buffer. - * @pos: Position where reading stopped. Will be updated. - * Necessarily 0 <= pos <= size. +ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); + + +/*! ZSTD_CCtx_params : + * Quick howto : + * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure + * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into + * an existing ZSTD_CCtx_params structure. + * This is similar to + * ZSTD_CCtx_setParameter(). + * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to + * an existing CCtx. + * These parameters will be applied to + * all subsequent frames. + * - ZSTD_compressStream2() : Do compression using the CCtx. + * - ZSTD_freeCCtxParams() : Free the memory, accept NULL pointer. + * + * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() + * for static allocation of CCtx for single-threaded compression. */ -typedef struct ZSTD_inBuffer_s { - const void *src; - size_t size; - size_t pos; -} ZSTD_inBuffer; +ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ -/** - * struct ZSTD_outBuffer - output buffer for streaming - * @dst: Start of the output buffer. - * @size: Size of the output buffer. - * @pos: Position where writing stopped. Will be updated. - * Necessarily 0 <= pos <= size. +/*! ZSTD_CCtxParams_reset() : + * Reset params to default values. */ -typedef struct ZSTD_outBuffer_s { - void *dst; - size_t size; - size_t pos; -} ZSTD_outBuffer; - - +ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); -/*-***************************************************************************** - * Streaming compression - HowTo - * - * A ZSTD_CStream object is required to track streaming operation. - * Use ZSTD_initCStream() to initialize a ZSTD_CStream object. 
- * ZSTD_CStream objects can be reused multiple times on consecutive compression - * operations. It is recommended to re-use ZSTD_CStream in situations where many - * streaming operations will be achieved consecutively. Use one separate - * ZSTD_CStream per thread for parallel execution. - * - * Use ZSTD_compressStream() repetitively to consume input stream. - * The function will automatically update both `pos` fields. - * Note that it may not consume the entire input, in which case `pos < size`, - * and it's up to the caller to present again remaining data. - * It returns a hint for the preferred number of bytes to use as an input for - * the next function call. - * - * At any moment, it's possible to flush whatever data remains within internal - * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might - * still be some content left within the internal buffer if `output->size` is - * too small. It returns the number of bytes left in the internal buffer and - * must be called until it returns 0. - * - * ZSTD_endStream() instructs to finish a frame. It will perform a flush and - * write frame epilogue. The epilogue is required for decoders to consider a - * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush - * the full content if `output->size` is too small. In which case, call again - * ZSTD_endStream() to complete the flush. It returns the number of bytes left - * in the internal buffer and must be called until it returns 0. - ******************************************************************************/ +/*! ZSTD_CCtxParams_init() : + * Initializes the compression parameters of cctxParams according to + * compression level. All other parameters are reset to their default values. + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); -/** - * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream - * @cParams: The compression parameters to be used for compression. - * - * Return: A lower bound on the size of the workspace that is passed to - * ZSTD_initCStream() and ZSTD_initCStream_usingCDict(). +/*! ZSTD_CCtxParams_init_advanced() : + * Initializes the compression and frame parameters of cctxParams according to + * params. All other parameters are reset to their default values. */ -size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams); +ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); + +/*! ZSTD_CCtxParams_setParameter() : + * Similar to ZSTD_CCtx_setParameter. + * Set one compression parameter, selected by enum ZSTD_cParameter. + * Parameters must be applied to a ZSTD_CCtx using + * ZSTD_CCtx_setParametersUsingCCtxParams(). + * @result : a code representing success or failure (which can be tested with + * ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); -/** - * struct ZSTD_CStream - the zstd streaming compression context +/*! ZSTD_CCtxParams_getParameter() : + * Similar to ZSTD_CCtx_getParameter. + * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); + +/*! ZSTD_CCtx_setParametersUsingCCtxParams() : + * Apply a set of ZSTD_CCtx_params to the compression context. 
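+ * For example (editor's sketch; the level and checksum values are arbitrary) :
+ *     ZSTD_CCtx_params* const cparams = ZSTD_createCCtxParams();
+ *     ZSTD_CCtxParams_setParameter(cparams, ZSTD_c_compressionLevel, 5);
+ *     ZSTD_CCtxParams_setParameter(cparams, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_setParametersUsingCCtxParams(cctx, cparams);
+ *     ZSTD_freeCCtxParams(cparams);   // cctx keeps its own copy of the applied values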
+ * This can be done even after compression is started :
+ * if nbWorkers==0, this will have no impact until a new compression is started.
+ * if nbWorkers>=1, new parameters will be picked up at the next job,
+ * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ *  Same as ZSTD_compressStream2(),
+ *  but using only integral types as arguments.
+ *  This variant might be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
-typedef struct ZSTD_CStream_s ZSTD_CStream;
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                            const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/* *************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict :
+ *  it must remain read-accessible throughout the lifetime of DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but references `dict` content instead of copying it into `dctx`.
+ *  This saves memory if `dict` remains around.
+ *  However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but gives direct control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ *  Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses to allocate internal buffers for frames requiring a window size larger than the provided limit.
+ *  This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); -/*===== ZSTD_CStream management functions =====*/ -/** - * ZSTD_initCStream() - initialize a zstd streaming compression context - * @params: The zstd compression parameters. - * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must - * pass the source size (zero means empty source). Otherwise, - * the caller may optionally pass the source size, or zero if - * unknown. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine - * how large the workspace must be. - * - * Return: The zstd streaming compression context. - */ -ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, - unsigned long long pledgedSrcSize, void *workspace, - size_t workspaceSize); - -/** - * ZSTD_initCStream_usingCDict() - initialize a streaming compression context - * @cdict: The digested dictionary to use for compression. - * @pledgedSrcSize: Optionally the source size, or zero if unknown. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound() - * with the cParams used to initialize the cdict to determine - * how large the workspace must be. - * - * Return: The zstd streaming compression context. - */ -ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, - unsigned long long pledgedSrcSize, void *workspace, - size_t workspaceSize); +/*! ZSTD_DCtx_getParameter() : + * Get the requested decompression parameter value, selected by enum ZSTD_dParameter, + * and store it into int* value. + * @return : 0, or an error code (which can be tested with ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); -/*===== Streaming compression functions =====*/ -/** - * ZSTD_resetCStream() - reset the context using parameters from creation - * @zcs: The zstd streaming compression context to reset. - * @pledgedSrcSize: Optionally the source size, or zero if unknown. - * - * Resets the context using the parameters from creation. Skips dictionary - * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame - * content size is always written into the frame header. - * - * Return: Zero or an error, which can be checked using ZSTD_isError(). +/* ZSTD_d_format + * experimental parameter, + * allowing selection between ZSTD_format_e input compression formats */ -size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize); -/** - * ZSTD_compressStream() - streaming compress some of input into output - * @zcs: The zstd streaming compression context. - * @output: Destination buffer. `output->pos` is updated to indicate how much - * compressed data was written. - * @input: Source buffer. `input->pos` is updated to indicate how much data was - * read. Note that it may not consume the entire input, in which case - * `input->pos < input->size`, and it's up to the caller to present - * remaining data again. - * - * The `input` and `output` buffers may be any size. Guaranteed to make some - * forward progress if `input` and `output` are not empty. 
- *
- * Return: A hint for the number of bytes to use as the input for the next
- * function call or an error, which can be checked using
- * ZSTD_isError().
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation, use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user-provided buffer.
+ */
-size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output,
- ZSTD_inBuffer *input);
-/**
- * ZSTD_flushStream() - flush internal buffers into output
- * @zcs: The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- * compressed data was written.
- *
- * ZSTD_flushStream() must be called until it returns 0, meaning all the data
- * has been flushed. Since ZSTD_flushStream() causes a block to be ended,
- * calling it too often will degrade the compression ratio.
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/* ZSTD_d_forceIgnoreChecksum
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
 *
- * Return: The number of bytes still present within internal buffers or an
- * error, which can be checked using ZSTD_isError().
+ * Tells the decompressor to skip checksum validation during decompression, regardless
+ * of whether checksumming was specified during compression. This offers some
+ * slight performance benefits, and may be useful for debugging.
+ * Param has values of type ZSTD_forceIgnoreChecksum_e */
-size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
-/**
- * ZSTD_endStream() - flush internal buffers into output and end the frame
- * @zcs: The zstd streaming compression context.
- * @output: Destination buffer. `output->pos` is updated to indicate how much
- * compressed data was written.
+#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3
+
+/* ZSTD_d_refMultipleDDicts
+ * Experimental parameter.
+ * Default is 0 == disabled.
+ * Set to 1 to enable.
 *
- * ZSTD_endStream() must be called until it returns 0, meaning all the data has
- * been flushed and the frame epilogue has been written.
+ * If enabled and dctx is allocated on the heap, then additional memory will be allocated
+ * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_DCtx_refDDict()
+ * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
+ * store all references. At decompression time, the appropriate dictID is selected
+ * from the set of DDicts based on the dictID in the frame.
 *
- * Return: The number of bytes still present within internal buffers or an
- * error, which can be checked using ZSTD_isError().
- */
-size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output);
-
-/**
- * ZSTD_CStreamInSize() - recommended size for the input buffer
+ * Usage is simply calling ZSTD_DCtx_refDDict() on multiple dict buffers.
 *
- * Return: The recommended size for the input buffer.
- */
-size_t ZSTD_CStreamInSize(void);
-/**
- * ZSTD_CStreamOutSize() - recommended size for the output buffer
+ * Param has values of type ZSTD_refMultipleDDicts_e
 *
- * When the output buffer is at least this large, it is guaranteed to be large
- * enough to flush at least one complete compressed block.
+ * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict() will trigger memory
+ * allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
+ * Memory is allocated as per ZSTD_DCtx::customMem.
 *
- * Return: The recommended size for the output buffer.
+ * Although zstd allocates memory for the table, the user is still responsible for
+ * memory management of the underlying ZSTD_DDict* themselves.
+ */
-size_t ZSTD_CStreamOutSize(void);
-
+#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
-/*-*****************************************************************************
- * Streaming decompression - HowTo
- *
- * A ZSTD_DStream object is required to track streaming operations.
- * Use ZSTD_initDStream() to initialize a ZSTD_DStream object.
- * ZSTD_DStream objects can be re-used multiple times.
- *
- * Use ZSTD_decompressStream() repetitively to consume your input.
- * The function will update both `pos` fields.
- * If `input->pos < input->size`, some input has not been consumed.
- * It's up to the caller to present again remaining data.
- * If `output->pos < output->size`, decoder has flushed everything it could.
- * Returns 0 iff a frame is completely decoded and fully flushed.
- * Otherwise it returns a suggested next input size that will never load more
- * than the current frame.
- ******************************************************************************/
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
-/**
- * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream
- * @maxWindowSize: The maximum window size allowed for compressed frames.
- *
- * Return: A lower bound on the size of the workspace that is passed to
- * ZSTD_initDStream() and ZSTD_initDStream_usingDDict().
+/*! ZSTD_decompressStream_simpleArgs() :
+ *  Same as ZSTD_decompressStream(),
+ *  but using only integral types as arguments.
+ *  This can be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                            const void* src, size_t srcSize, size_t* srcPos);
+
+
+/* ******************************************************************
+* Advanced streaming functions
+* Warning : most of these functions are now redundant with the Advanced API.
+* Once Advanced API reaches "stable" status,
+* redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+
+/*! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+                         int compressionLevel,
+                         unsigned long long pledgedSrcSize);
+
+/*! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+                           const void* dict, size_t dictSize,
+                           int compressionLevel);
+
+/*! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ *     for ((param, value) : params) {
+ *         ZSTD_CCtx_setParameter(zcs, param, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                          const void* dict, size_t dictSize,
+                          ZSTD_parameters params,
+                          unsigned long long pledgedSrcSize);
+
+/*!
ZSTD_initCStream_usingCDict() : + * This function is deprecated, and equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * note : cdict will just be referenced, and must outlive compression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + +/*! ZSTD_initCStream_usingCDict_advanced() : + * This function is DEPRECATED, and is approximately equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. + * for ((fParam, value) : fParams) { + * ZSTD_CCtx_setParameter(zcs, fParam, value); + * } + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. + * pledgedSrcSize must be correct. If srcSize is not known at init time, use + * value ZSTD_CONTENTSIZE_UNKNOWN. + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t +ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize); + +/*! ZSTD_resetCStream() : + * This function is deprecated, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * + * start a new frame, using same parameters from previous frame. + * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. + * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, + * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ +ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); -/** - * struct ZSTD_DStream - the zstd streaming decompression context + +typedef struct { + unsigned long long ingested; /* nb input bytes read and buffered */ + unsigned long long consumed; /* nb input bytes actually compressed */ + unsigned long long produced; /* nb of compressed bytes generated and buffered */ + unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */ + unsigned currentJobID; /* MT only : latest started job nb */ + unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */ +} ZSTD_frameProgression; + +/* ZSTD_getFrameProgression() : + * tells how much data has been ingested (read from input) + * consumed (input actually compressed) and produced (output) for current frame. + * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. + * Aggregates progression inside active worker threads. 
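+ *
+ * Illustrative polling sketch (editor's example; `cctx` is assumed to be mid-frame) :
+ *     ZSTD_frameProgression const fp = ZSTD_getFrameProgression(cctx);
+ *     unsigned long long const buffered = fp.ingested - fp.consumed;   // input read but not yet compressed
+ *     // fp.produced tracks compressed bytes generated so far; fp.flushed is not provided (see struct above)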
*/ -typedef struct ZSTD_DStream_s ZSTD_DStream; -/*===== ZSTD_DStream management functions =====*/ -/** - * ZSTD_initDStream() - initialize a zstd streaming decompression context - * @maxWindowSize: The maximum window size allowed for compressed frames. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine - * how large the workspace must be. - * - * Return: The zstd streaming decompression context. - */ -ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, - size_t workspaceSize); -/** - * ZSTD_initDStream_usingDDict() - initialize streaming decompression context - * @maxWindowSize: The maximum window size allowed for compressed frames. - * @ddict: The digested dictionary to use for decompression. - * @workspace: The workspace to emplace the context into. It must outlive - * the returned context. - * @workspaceSize: The size of workspace. - * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine - * how large the workspace must be. - * - * Return: The zstd streaming decompression context. - */ -ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, - const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize); +ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); + +/*! ZSTD_toFlushNow() : + * Tell how many bytes are ready to be flushed immediately. + * Useful for multithreading scenarios (nbWorkers >= 1). + * Probe the oldest active job, defined as oldest job not yet entirely flushed, + * and check its output buffer. + * @return : amount of data stored in oldest job and ready to be flushed immediately. + * if @return == 0, it means either : + * + there is no active job (could be checked with ZSTD_frameProgression()), or + * + oldest job is still actively compressing data, + * but everything it has produced has also been flushed so far, + * therefore flush speed is limited by production speed of oldest job + * irrespective of the speed of concurrent (and newer) jobs. + */ +ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); -/*===== Streaming decompression functions =====*/ -/** - * ZSTD_resetDStream() - reset the context using parameters from creation - * @zds: The zstd streaming decompression context to reset. + +/*===== Advanced Streaming decompression functions =====*/ + +/*! + * This function is deprecated, and is equivalent to: * - * Resets the context using the parameters from creation. Skips dictionary - * loading, since it can be reused. + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); * - * Return: Zero or an error, which can be checked using ZSTD_isError(). + * note: no dictionary will be used if dict == NULL or dictSize < 8 + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -size_t ZSTD_resetDStream(ZSTD_DStream *zds); -/** - * ZSTD_decompressStream() - streaming decompress some of input into output - * @zds: The zstd streaming decompression context. - * @output: Destination buffer. `output.pos` is updated to indicate how much - * decompressed data was written. - * @input: Source buffer. `input.pos` is updated to indicate how much data was - * read. Note that it may not consume the entire input, in which case - * `input.pos < input.size`, and it's up to the caller to present - * remaining data again. 
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + +/*! + * This function is deprecated, and is equivalent to: * - * The `input` and `output` buffers may be any size. Guaranteed to make some - * forward progress if `input` and `output` are not empty. - * ZSTD_decompressStream() will not consume the last byte of the frame until - * the entire frame is flushed. + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, ddict); * - * Return: Returns 0 iff a frame is completely decoded and fully flushed. - * Otherwise returns a hint for the number of bytes to use as the input - * for the next function call or an error, which can be checked using - * ZSTD_isError(). The size hint will never load more than the frame. + * note : ddict is referenced, it must outlive decompression session + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, - ZSTD_inBuffer *input); +ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); -/** - * ZSTD_DStreamInSize() - recommended size for the input buffer - * - * Return: The recommended size for the input buffer. - */ -size_t ZSTD_DStreamInSize(void); -/** - * ZSTD_DStreamOutSize() - recommended size for the output buffer +/*! + * This function is deprecated, and is equivalent to: * - * When the output buffer is at least this large, it is guaranteed to be large - * enough to flush at least one complete decompressed block. + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); * - * Return: The recommended size for the output buffer. + * re-use decompression parameters from previous init; saves dictionary loading + * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */ -size_t ZSTD_DStreamOutSize(void); +ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); -/* --- Constants ---*/ -#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ -#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U +/* ******************************************************************* +* Buffer-less and synchronous inner streaming functions +* +* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +* But it's also a complex one, with several restrictions, documented below. +* Prefer normal streaming API for an easier experience. +********************************************************************* */ -#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) -#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +/* + Buffer-less streaming compression (synchronous mode) + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. + ZSTD_CCtx object can be re-used multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, + or ZSTD_compressBegin_advanced(), for finer parameter control. + It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : + - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only. 
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when the `src` buffer overlaps,
+    in which case it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
-#define ZSTD_WINDOWLOG_MAX_32 27
-#define ZSTD_WINDOWLOG_MAX_64 27
-#define ZSTD_WINDOWLOG_MAX \
- ((unsigned int)(sizeof(size_t) == 4 \
- ? ZSTD_WINDOWLOG_MAX_32 \
- : ZSTD_WINDOWLOG_MAX_64))
-#define ZSTD_WINDOWLOG_MIN 10
-#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX
-#define ZSTD_HASHLOG_MIN 6
-#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1)
-#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
-#define ZSTD_HASHLOG3_MAX 17
-#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
-#define ZSTD_SEARCHLOG_MIN 1
-/* only for ZSTD_fast, other strategies are limited to 6 */
-#define ZSTD_SEARCHLENGTH_MAX 7
-/* only for ZSTD_btopt, other strategies are limited to 4 */
-#define ZSTD_SEARCHLENGTH_MIN 3
-#define ZSTD_TARGETLENGTH_MIN 4
-#define ZSTD_TARGETLENGTH_MAX 999
-
-/* for static allocation */
-#define ZSTD_FRAMEHEADERSIZE_MAX 18
-#define ZSTD_FRAMEHEADERSIZE_MIN 6
-#define ZSTD_frameHeaderSize_prefix 5
-#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN
-#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX
-/* magic number + skippable frame length */
-#define ZSTD_skippableHeaderSize 8
-
-
-/*-*************************************
- * Compressed size functions
- **************************************/
-
-/**
- * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame
- * @src: Source buffer. It should point to the start of a zstd encoded frame
- * or a skippable frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- * size of the frame.
- *
- * Return: The compressed size of the frame pointed to by `src` or an error,
- * which can be check with ZSTD_isError().
- * Suitable to pass to ZSTD_decompress() or similar functions.
- */
-size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize);
-
-/*-*************************************
- * Decompressed size functions
- **************************************/
-/**
- * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header
- * @src: It should point to the start of a zstd encoded frame.
- * @srcSize: The size of the source buffer. It must be at least as large as the
- * frame header. `ZSTD_frameHeaderSize_max` is always large enough.
- *
- * Return: The frame content size stored in the frame header if known.
- * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the
- * frame header.
`ZSTD_CONTENTSIZE_ERROR` on invalid input. - */ -unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); - -/** - * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames - * @src: It should point to the start of a series of zstd encoded and/or - * skippable frames. - * @srcSize: The exact size of the series of frames. - * - * If any zstd encoded frame in the series doesn't have the frame content size - * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always - * set when using ZSTD_compress(). The decompressed size can be very large. - * If the source is untrusted, the decompressed size could be wrong or - * intentionally modified. Always ensure the result fits within the - * application's authorized limits. ZSTD_findDecompressedSize() handles multiple - * frames, and so it must traverse the input to read each frame header. This is - * efficient as most of the data is skipped, however it does mean that all frame - * data must be present and valid. - * - * Return: Decompressed size of all the data contained in the frames if known. - * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown. - * `ZSTD_CONTENTSIZE_ERROR` if an error occurred. - */ -unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize); - -/*-************************************* - * Advanced compression functions - **************************************/ -/** - * ZSTD_checkCParams() - ensure parameter values remain within authorized range - * @cParams: The zstd compression parameters. - * - * Return: Zero or an error, which can be checked using ZSTD_isError(). - */ -size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams); - -/** - * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize - * @srcSize: Optionally the estimated source size, or zero if unknown. - * @dictSize: Optionally the estimated dictionary size, or zero if unknown. - * - * Return: The optimized parameters. - */ -ZSTD_compressionParameters ZSTD_adjustCParams( - ZSTD_compressionParameters cParams, unsigned long long srcSize, - size_t dictSize); - -/*--- Advanced decompression functions ---*/ - -/** - * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame - * @buffer: The source buffer to check. - * @size: The size of the source buffer, must be at least 4 bytes. - * - * Return: True iff the buffer starts with a zstd or skippable frame identifier. - */ -unsigned int ZSTD_isFrame(const void *buffer, size_t size); - -/** - * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary - * @dict: The dictionary buffer. - * @dictSize: The size of the dictionary buffer. - * - * Return: The dictionary id stored within the dictionary or 0 if the - * dictionary is not a zstd dictionary. If it returns 0 the - * dictionary can still be loaded as a content-only dictionary. 
- */ -unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize); +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ -/** - * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict - * @ddict: The ddict to find the id of. - * - * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not - * a zstd dictionary. If it returns 0 `ddict` will be loaded as a - * content-only dictionary. - */ -unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict); +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -/** - * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame - * @src: Source buffer. It must be a zstd encoded frame. - * @srcSize: The size of the source buffer. It must be at least as large as the - * frame header. `ZSTD_frameHeaderSize_max` is always large enough. - * - * Return: The dictionary id required to decompress the frame stored within - * `src` or 0 if the dictionary id could not be decoded. It can return - * 0 if the frame does not require a dictionary, the dictionary id - * wasn't stored in the frame, `src` is not a zstd frame, or `srcSize` - * is too small. - */ -unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize); -/** - * struct ZSTD_frameParams - zstd frame parameters stored in the frame header - * @frameContentSize: The frame content size, or 0 if not present. - * @windowSize: The window size, or 0 if the frame is a skippable frame. - * @dictID: The dictionary id, or 0 if not present. - * @checksumFlag: Whether a checksum was used. - */ +/* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. 
+ @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. + + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. + At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are setup, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. 
+ + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. + The format of skippable frames is as follows : + a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F + b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + c) Frame Content - any content (User Data) of length equal to Frame Size + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. +*/ + +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; typedef struct { - unsigned long long frameContentSize; - unsigned int windowSize; - unsigned int dictID; - unsigned int checksumFlag; -} ZSTD_frameParams; - -/** - * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame - * @fparamsPtr: On success the frame parameters are written here. - * @src: The source buffer. It must point to a zstd or skippable frame. - * @srcSize: The size of the source buffer. `ZSTD_frameHeaderSize_max` is - * always large enough to succeed. - * - * Return: 0 on success. If more data is required it returns how many bytes - * must be provided to make forward progress. Otherwise it returns - * an error, which can be checked using ZSTD_isError(). - */ -size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, - size_t srcSize); - -/*-***************************************************************************** - * Buffer-less and synchronous inner streaming functions - * - * This is an advanced API, giving full control over buffer management, for - * users which need direct control over memory. - * But it's also a complex one, with many restrictions (documented below). - * Prefer using normal streaming API for an easier experience - ******************************************************************************/ + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +/*! 
ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/* Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. + + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. +*/ -/*-***************************************************************************** - * Buffer-less streaming compression (synchronous mode) - * - * A ZSTD_CCtx object is required to track streaming operations. - * Use ZSTD_initCCtx() to initialize a context. - * ZSTD_CCtx object can be re-used multiple times within successive compression - * operations. 
- * - * Start by initializing a context. - * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary - * compression, - * or ZSTD_compressBegin_advanced(), for finer parameter control. - * It's also possible to duplicate a reference context which has already been - * initialized, using ZSTD_copyCCtx() - * - * Then, consume your input using ZSTD_compressContinue(). - * There are some important considerations to keep in mind when using this - * advanced function : - * - ZSTD_compressContinue() has no internal buffer. It uses externally provided - * buffer only. - * - Interface is synchronous : input is consumed entirely and produce 1+ - * (or more) compressed blocks. - * - Caller must ensure there is enough space in `dst` to store compressed data - * under worst case scenario. Worst case evaluation is provided by - * ZSTD_compressBound(). - * ZSTD_compressContinue() doesn't guarantee recover after a failed - * compression. - * - ZSTD_compressContinue() presumes prior input ***is still accessible and - * unmodified*** (up to maximum distance size, see WindowLog). - * It remembers all previous contiguous blocks, plus one separated memory - * segment (which can itself consists of multiple contiguous blocks) - * - ZSTD_compressContinue() detects that prior input has been overwritten when - * `src` buffer overlaps. In which case, it will "discard" the relevant memory - * section from its history. - * - * Finish a frame with ZSTD_compressEnd(), which will write the last block(s) - * and optional checksum. It's possible to use srcSize==0, in which case, it - * will write a final empty block to end the frame. Without last block mark, - * frames will be considered unfinished (corrupted) by decoders. - * - * `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress some new - * frame. - ******************************************************************************/ +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ -/*===== Buffer-less streaming compression functions =====*/ -size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel); -size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, - size_t dictSize, int compressionLevel); -size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, - size_t dictSize, ZSTD_parameters params, - unsigned long long pledgedSrcSize); -size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx, - unsigned long long pledgedSrcSize); -size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, - unsigned long long pledgedSrcSize); -size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); - - - -/*-***************************************************************************** - * Buffer-less streaming decompression (synchronous mode) - * - * A ZSTD_DCtx object is required to track streaming operations. 
- * Use ZSTD_initDCtx() to initialize a context. - * A ZSTD_DCtx object can be re-used multiple times. - * - * First typical operation is to retrieve frame parameters, using - * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provide - * important information to correctly decode the frame, such as the minimum - * rolling buffer size to allocate to decompress data (`windowSize`), and the - * dictionary ID used. - * Note: content size is optional, it may not be present. 0 means unknown. - * Note that these values could be wrong, either because of data malformation, - * or because an attacker is spoofing deliberate false information. As a - * consequence, check that values remain within valid application range, - * especially `windowSize`, before allocation. Each application can set its own - * limit, depending on local restrictions. For extended interoperability, it is - * recommended to support at least 8 MB. - * Frame parameters are extracted from the beginning of the compressed frame. - * Data fragment must be large enough to ensure successful decoding, typically - * `ZSTD_frameHeaderSize_max` bytes. - * Result: 0: successful decoding, the `ZSTD_frameParams` structure is filled. - * >0: `srcSize` is too small, provide at least this many bytes. - * errorCode, which can be tested using ZSTD_isError(). - * - * Start decompression, with ZSTD_decompressBegin() or - * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared - * context, using ZSTD_copyDCtx(). - * - * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() - * alternatively. - * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' - * to ZSTD_decompressContinue(). - * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will - * fail. - * - * The result of ZSTD_decompressContinue() is the number of bytes regenerated - * within 'dst' (necessarily <= dstCapacity). It can be zero, which is not an - * error; it just means ZSTD_decompressContinue() has decoded some metadata - * item. It can also be an error code, which can be tested with ZSTD_isError(). - * - * ZSTD_decompressContinue() needs previous data blocks during decompression, up - * to `windowSize`. They should preferably be located contiguously, prior to - * current block. Alternatively, a round buffer of sufficient size is also - * possible. Sufficient size is determined by frame parameters. - * ZSTD_decompressContinue() is very sensitive to contiguity, if 2 blocks don't - * follow each other, make sure that either the compressor breaks contiguity at - * the same place, or that previous contiguous segment is large enough to - * properly handle maximum back-reference. - * - * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. - * Context can then be reset to start a new decompression. - * - * Note: it's possible to know if next input to present is a header or a block, - * using ZSTD_nextInputType(). This information is not required to properly - * decode a frame. - * - * == Special case: skippable frames == - * - * Skippable frames allow integration of user-defined data into a flow of - * concatenated frames. Skippable frames will be ignored (skipped) by a - * decompressor. 
The format of skippable frames is as follows: - * a) Skippable frame ID - 4 Bytes, Little endian format, any value from - * 0x184D2A50 to 0x184D2A5F - * b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits - * c) Frame Content - any content (User Data) of length equal to Frame Size - * For skippable frames ZSTD_decompressContinue() always returns 0. - * For skippable frames ZSTD_getFrameParams() returns fparamsPtr->windowLog==0 - * what means that a frame is skippable. - * Note: If fparamsPtr->frameContentSize==0, it is ambiguous: the frame might - * actually be a zstd encoded frame with no content. For purposes of - * decompression, it is valid in both cases to skip the frame using - * ZSTD_findFrameCompressedSize() to find its size in bytes. - * It also returns frame size as fparamsPtr->frameContentSize. - ******************************************************************************/ -/*===== Buffer-less streaming decompression functions =====*/ -size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx); -size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, - size_t dictSize); -void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx); -size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx); -size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -typedef enum { - ZSTDnit_frameHeader, - ZSTDnit_blockHeader, - ZSTDnit_block, - ZSTDnit_lastBlock, - ZSTDnit_checksum, - ZSTDnit_skippableFrame -} ZSTD_nextInputType_e; -ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx); - -/*-***************************************************************************** - * Block functions - * - * Block functions produce and decode raw zstd blocks, without frame metadata. - * Frame metadata cost is typically ~18 bytes, which can be non-negligible for - * very small blocks (< 100 bytes). User will have to take in charge required - * information to regenerate data, such as compressed and content sizes. - * - * A few rules to respect: - * - Compressing and decompressing require a context structure - * + Use ZSTD_initCCtx() and ZSTD_initDCtx() - * - It is necessary to init context before starting - * + compression : ZSTD_compressBegin() - * + decompression : ZSTD_decompressBegin() - * + variants _usingDict() are also allowed - * + copyCCtx() and copyDCtx() work too - * - Block size is limited, it must be <= ZSTD_getBlockSizeMax() - * + If you need to compress more, cut data into multiple blocks - * + Consider using the regular ZSTD_compress() instead, as frame metadata - * costs become negligible when source size is large. - * - When a block is considered not compressible enough, ZSTD_compressBlock() - * result will be zero. In which case, nothing is produced into `dst`. - * + User must test for such outcome and deal directly with uncompressed data - * + ZSTD_decompressBlock() doesn't accept uncompressed data as input!!! - * + In case of multiple successive blocks, decoder must be informed of - * uncompressed block existence to follow proper history. Use - * ZSTD_insertBlock() in such a case. 
- ******************************************************************************/ +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ -/* Define for static allocation */ -#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) -/*===== Raw zstd block functions =====*/ -size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx); -size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, - const void *src, size_t srcSize); -size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, - size_t blockSize); - -#endif /* ZSTD_H */ -- cgit v1.2.3 From fa443bc3c1e4b28d9315dea882e8358ba6e26f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 29 Oct 2021 17:28:56 +0200 Subject: HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows drivers for ISH devices to be autoloaded selectively. Currently all ISH drivers are loaded for all systems having any ISH device. Signed-off-by: Thomas Weißschuh Acked-by: Srinivas Pandruvada Acked-by: Hans de Goede Signed-off-by: Jiri Kosina --- include/linux/mod_devicetable.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index ae2e75d15b21..befbf53c4b7c 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -895,4 +895,17 @@ struct dfl_device_id { kernel_ulong_t driver_data; }; +/* ISHTP (Integrated Sensor Hub Transport Protocol) */ + +#define ISHTP_MODULE_PREFIX "ishtp:" + +/** + * struct ishtp_device_id - ISHTP device identifier + * @guid: GUID of the device, e.g. fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * (matched against the GUID reported by the ISH firmware client) + */ +struct ishtp_device_id { + guid_t guid; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ -- cgit v1.2.3 From 9ef4d0209cbadb63656a7aa29fde49c27ab2b9bf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:41 +0800 Subject: blk-mq: add one API for waiting until quiesce is done Some drivers (NVMe, SCSI) need to call quiesce and unquiesce in pairs, but it is hard to switch to this style, so these drivers need one atomic flag to help balance quiesce and unquiesce. When quiesce is in progress, the driver still needs to wait until the quiesce is done, so add the blk_mq_wait_quiesce_done() API for these drivers. Signed-off-by: Ming Lei Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20211109071144.181581-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b4039fdf1b04..d53ee59ba131 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -803,6 +803,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit v1.2.3 From 51b8c1fe250d1bd70c1722dc3c414f5cff2d7cca Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Nov 2021 18:31:24 -0800 Subject: vfs: keep inodes with page cache off the inode shrinker LRU Historically (pre-2.5), the inode shrinker used to reclaim only empty inodes and skip over those that still contained page cache. This caused problems on highmem hosts: struct inode objects could fill lowmem zones before the cache was getting reclaimed in the highmem zones. To address this, the inode shrinker started to strip page cache to facilitate reclaiming lowmem. However, this comes with its own set of problems: the shrinkers may drop actively used page cache just because the inodes are not currently open or dirty - think working with a large git tree. It further doesn't respect cgroup memory protection settings and can cause priority inversions between containers. Nowadays, the page cache also holds non-resident info for evicted cache pages in order to detect refaults. We've come to rely heavily on this data inside reclaim for protecting the cache workingset and driving swap behavior. We also use it to quantify and report workload health through psi. The latter in turn is used for fleet health monitoring, as well as driving automated memory sizing of workloads and containers, proactive reclaim and memory offloading schemes. The consequences of dropping page cache prematurely are that we're seeing subtle and not-so-subtle failures in all of the above-mentioned scenarios, with the workload generally entering unexpected thrashing states while losing the ability to reliably detect it. To fix this on non-highmem systems at least, going back to rotating inodes on the LRU isn't feasible. We've tried (commit a76cf1a474d7 ("mm: don't reclaim inodes with many attached pages")) and failed (commit 69056ee6a8a3 ("Revert "mm: don't reclaim inodes with many attached pages"")). The issue is mostly that shrinker pools attract pressure based on their size, and when objects get skipped the shrinkers remember this as deferred reclaim work. This accumulates excessive pressure on the remaining inodes, and we can quickly eat into heavily used ones, or dirty ones that require IO to reclaim, when there potentially is plenty of cold, clean cache around still. Instead, this patch keeps populated inodes off the inode LRU in the first place - just like an open file or dirty state would. An otherwise clean and unused inode then gets queued when the last cache entry disappears.
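As a rough sketch (not part of the patch, and simplified from the actual fs/inode.c changes), the shrinker side of this scheme boils down to a check like the following, using the mapping_shrinkable() helper added to pagemap.h in the diff below:

    /* Hypothetical, simplified shrinker-side check, called under inode->i_lock */
    static bool inode_is_shrinkable(struct inode *inode)
    {
            lockdep_assert_held(&inode->i_lock);

            /* Busy or referenced inodes stay off the LRU, as before. */
            if (atomic_read(&inode->i_count) ||
                (inode->i_state & ~I_REFERENCED))
                    return false;

            /* New: populated page cache now also keeps the inode off the LRU. */
            return mapping_shrinkable(inode->i_mapping);
    }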
This solves the problem without reintroducing the reclaim issues, and generally is a bit more scalable than having to wade through potentially hundreds of thousands of busy inodes. Locking is a bit tricky because the locks protecting the inode state (i_lock) and the inode LRU (lru_list.lock) don't nest inside the irq-safe page cache lock (i_pages.xa_lock). Page cache deletions are serialized through i_lock, taken before the i_pages lock, to make sure depopulated inodes are queued reliably. Additions may race with deletions, but we'll check again in the shrinker. If additions race with the shrinker itself, we're protected by the i_lock: if find_inode() or iput() win, the shrinker will bail on the elevated i_count or I_REFERENCED; if the shrinker wins and goes ahead with the inode, it will set I_FREEING and inhibit further igets(), which will cause the other side to create a new instance of the inode instead. Link: https://lkml.kernel.org/r/20210614211904.14420-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Roman Gushchin Cc: Tejun Heo Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + include/linux/pagemap.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 226de651f52e..de35a2640e70 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3193,6 +3193,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); +extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 62db6b0176b9..5c74a45ff97a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -23,6 +23,56 @@ static inline bool mapping_empty(struct address_space *mapping) return xa_empty(&mapping->i_pages); } +/* + * mapping_shrinkable - test if page cache state allows inode reclaim + * @mapping: the page cache mapping + * + * This checks the mapping's cache state for the purpose of inode + * reclaim and LRU management. + * + * The caller is expected to hold the i_lock, but is not required to + * hold the i_pages lock, which usually protects cache state. That's + * because the i_lock and the list_lru lock that protect the inode and + * its LRU state don't nest inside the irq-safe i_pages lock. + * + * Cache deletions are performed under the i_lock, which ensures that + * when an inode goes empty, it will reliably get queued on the LRU. + * + * Cache additions do not acquire the i_lock and may race with this + * check, in which case we'll report the inode as shrinkable when it + * has cache pages. This is okay: the shrinker also checks the + * refcount and the referenced bit, which will be elevated or set in + * the process of adding new cache pages to an inode. + */ +static inline bool mapping_shrinkable(struct address_space *mapping) +{ + void *head; + + /* + * On highmem systems, there could be lowmem pressure from the + * inodes before there is highmem pressure from the page + * cache. Make inodes shrinkable regardless of cache state. + */ + if (IS_ENABLED(CONFIG_HIGHMEM)) + return true; + + /* Cache completely empty? Shrink away.
*/ + head = rcu_access_pointer(mapping->i_pages.xa_head); + if (!head) + return true; + + /* + * The xarray stores single offset-0 entries directly in the + * head pointer, which allows non-resident page cache entries + * to escape the shadow shrinker's list of xarray nodes. The + * inode shrinker needs to pick them up under memory pressure. + */ + if (!xa_is_node(head) && xa_is_value(head)) + return true; + + return false; +} + /* * Bits in mapping->flags. */ -- cgit v1.2.3 From 83c1fd763b3220458652f7d656d5047292ebcde4 Mon Sep 17 00:00:00 2001 From: zhangyiru Date: Mon, 8 Nov 2021 18:31:27 -0800 Subject: mm,hugetlb: remove mlock ulimit for SHM_HUGETLB Commit 21a3c273f88c ("mm, hugetlb: add thread name and pid to SHM_HUGETLB mlock rlimit warning") marked this as deprecated in 2012, but it has not been deleted yet. Mike says he still sees that message in log files on occasion, so maybe we should preserve this warning. Also remove the hugetlbfs-related user_shm_unlock() call in ipc/shm.c and the user_shm_unlock() call after the out label. Link: https://lkml.kernel.org/r/20211103105857.25041-1-zhangyiru3@huawei.com Signed-off-by: zhangyiru Reviewed-by: Mike Kravetz Cc: Hugh Dickins Cc: Liu Zixian Cc: Michal Hocko Cc: wuxu.wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 44c2ab0dfa59..00351ccb49a3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -477,8 +477,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, - struct ucounts **ucounts, int creat_flags, - int page_size_log); + int creat_flags, int page_size_log); static inline bool is_file_hugepages(struct file *file) { @@ -497,8 +496,7 @@ static inline struct hstate *hstate_inode(struct inode *i) #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, - struct ucounts **ucounts, int creat_flags, - int page_size_log) + int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } -- cgit v1.2.3 From cc5f2704c93456e6f1c671c5cf7b582a55df5484 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:31:48 -0800 Subject: proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks Let's support multiple registered callbacks, making sure that registering vmcore callbacks cannot fail. Make the callback return a bool instead of an int, handling how to deal with errors internally. Drop unused HAVE_OLDMEM_PFN_IS_RAM. We soon want to make use of this infrastructure from other drivers: virtio-mem, registering one callback for each virtio-mem device, to prevent reading unplugged virtio-mem memory. Handle it via a generic vmcore_cb structure, prepared for future extensions: for example, once we support virtio-mem on s390x where the vmcore is completely constructed in the second kernel, we want to detect and add plugged virtio-mem memory ranges to the vmcore in order for them to get dumped properly.
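As a rough illustration (not part of the patch; the my_* names are hypothetical), a driver such as virtio-mem would use the new interface along these lines, with struct vmcore_cb as defined in the diff below:

    /* Hypothetical driver-side usage of the vmcore_cb interface */
    static bool my_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
    {
            /* Report false for pages that must not be read, e.g. unplugged memory. */
            return !my_driver_pfn_is_unplugged(pfn);   /* hypothetical helper */
    }

    static struct vmcore_cb my_vmcore_cb = {
            .pfn_is_ram = my_pfn_is_ram,
    };

    /* In the kdump kernel, typically from the driver's probe/remove paths: */
    register_vmcore_cb(&my_vmcore_cb);
    ...
    unregister_vmcore_cb(&my_vmcore_cb);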
Handle corner cases that are unexpected and shouldn't happen in sane setups: registering a callback after the vmcore has already been opened (warn only) and unregistering a callback after the vmcore has already been opened (warn and essentially read only zeroes from that point on). Link: https://lkml.kernel.org/r/20211005121430.30136-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Wang Cc: Juergen Gross Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2618577a4d6d..0c547d866f1e 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -91,9 +91,29 @@ static inline void vmcore_unusable(void) elfcorehdr_addr = ELFCORE_ADDR_ERR; } -#define HAVE_OLDMEM_PFN_IS_RAM 1 -extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)); -extern void unregister_oldmem_pfn_is_ram(void); +/** + * struct vmcore_cb - driver callbacks for /proc/vmcore handling + * @pfn_is_ram: check whether a PFN really is RAM and should be accessed when + * reading the vmcore. Will return "true" if it is RAM or if the + * callback cannot tell. If any callback returns "false", it's not + * RAM and the page must not be accessed; zeroes should be + * indicated in the vmcore instead. For example, a ballooned page + * contains no data and reading from such a page will cause high + * load in the hypervisor. + * @next: List head to manage registered callbacks internally; initialized by + * register_vmcore_cb(). + * + * vmcore callbacks allow drivers managing physical memory ranges to + * coordinate with vmcore handling code, for example, to prevent accessing + * physical memory ranges that should not be accessed when reading the vmcore, + * although included in the vmcore header as memory ranges to dump. + */ +struct vmcore_cb { + bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + struct list_head next; +}; +extern void register_vmcore_cb(struct vmcore_cb *cb); +extern void unregister_vmcore_cb(struct vmcore_cb *cb); #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return 0; } -- cgit v1.2.3 From f5d80614844a725cbf66fd6524e01c218ede2141 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:08 -0800 Subject: kernel.h: drop unneeded inclusion from other headers Patch series "kernel.h further split", v5. kernel.h is a grab-bag of unrelated definitions that gets pulled into many unrelated compilation units, especially when drivers need only one or two macro definitions from it. This patch (of 7): There is no evidence we need the kernel.h inclusion in certain headers. Drop the unneeded inclusion from those headers.
[sfr@canb.auug.org.au: bottom_half.h needs kernel] Link: https://lkml.kernel.org/r/20211015202908.1c417ae2@canb.auug.org.au Link: https://lkml.kernel.org/r/20211013170417.87909-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20211013170417.87909-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Stephen Rothwell Cc: Thomas Gleixner Cc: Brendan Higgins Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Cc: Sakari Ailus Cc: Laurent Pinchart Cc: Mauro Carvalho Chehab Cc: Miguel Ojeda Cc: Jonathan Cameron Cc: Rasmus Villemoes Cc: Thorsten Leemhuis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bottom_half.h | 1 + include/linux/rwsem.h | 1 - include/linux/smp.h | 1 - include/linux/spinlock.h | 1 - 4 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index eed86eb0a1de..11d107d88d03 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,6 +2,7 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H +#include <linux/kernel.h> #include <linux/preempt.h> #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 352c6127cb90..f9348769e558 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -11,7 +11,6 @@ #include <linux/linkage.h> #include <linux/types.h> -#include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> diff --git a/include/linux/smp.h b/include/linux/smp.h index 510519e8a1eb..a80ab58ae3f1 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -108,7 +108,6 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, #ifdef CONFIG_SMP #include <linux/preempt.h> -#include <linux/kernel.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 45310ea1b1d7..22d672f81705 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -57,7 +57,6 @@ #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> -#include <linux/kernel.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> -- cgit v1.2.3 From d2a8ebbf8192b84b11f1b204c4f7c602df32aeac Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:12 -0800 Subject: kernel.h: split out container_of() and typeof_member() macros kernel.h has been used as a dumping ground for all kinds of stuff for a long time. Here is an attempt to clean it up by splitting out the container_of() and typeof_member() macros. For the time being, include the new header back into kernel.h to avoid twisted indirect includes for existing users. Note, there are _a lot_ of headers and modules that include kernel.h solely for one of these macros; the split unburdens the compiler of the twisted inclusion paths and makes new code cleaner in the future.
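The pattern these macros support is ubiquitous in the kernel; as a quick refresher (example code, not part of the patch):

    struct my_device {                      /* hypothetical wrapper structure */
            int id;
            struct list_head node;
    };

    /* Given a pointer to the embedded member, recover the wrapper: */
    static struct my_device *dev_from_node(struct list_head *node)
    {
            return container_of(node, struct my_device, node);
    }

With the split, a header that only needs this macro can include <linux/container_of.h> instead of all of <linux/kernel.h>.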
Link: https://lkml.kernel.org/r/20211013170417.87909-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Boqun Feng Cc: Brendan Higgins Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Laurent Pinchart Cc: Mauro Carvalho Chehab Cc: Miguel Ojeda Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Sakari Ailus Cc: Thomas Gleixner Cc: Thorsten Leemhuis Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/container_of.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/kernel.h | 33 +--------------------------------- 2 files changed, 41 insertions(+), 32 deletions(-) create mode 100644 include/linux/container_of.h (limited to 'include/linux') diff --git a/include/linux/container_of.h b/include/linux/container_of.h new file mode 100644 index 000000000000..dd56019838c6 --- /dev/null +++ b/include/linux/container_of.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CONTAINER_OF_H +#define _LINUX_CONTAINER_OF_H + +#include <linux/build_bug.h> +#include <linux/err.h> + +#define typeof_member(T, m) typeof(((T*)0)->m) + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ + !__same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ + ((type *)(__mptr - offsetof(type, member))); }) + +/** + * container_of_safe - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. + */ +#define container_of_safe(ptr, type, member) ({ \ + void *__mptr = (void *)(ptr); \ + BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ + !__same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ + IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) : \ + ((type *)(__mptr - offsetof(type, member))); }) + +#endif /* _LINUX_CONTAINER_OF_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 471bc0593679..ed4465757cec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -9,6 +9,7 @@ #include <linux/stddef.h> #include <linux/types.h> #include <linux/compiler.h> +#include <linux/container_of.h> #include <linux/bitops.h> #include <linux/kstrtox.h> #include <linux/log2.h> @@ -52,8 +53,6 @@ } \ ) -#define typeof_member(T, m) typeof(((T*)0)->m) - #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) @@ -484,36 +483,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define __CONCAT(a, b) a ## b #define CONCATENATE(a, b) __CONCAT(a, b) -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct.
- * - */ -#define container_of(ptr, type, member) ({ \ - void *__mptr = (void *)(ptr); \ - BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ - !__same_type(*(ptr), void), \ - "pointer type mismatch in container_of()"); \ - ((type *)(__mptr - offsetof(type, member))); }) - -/** - * container_of_safe - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. - * - * If IS_ERR_OR_NULL(ptr), ptr is returned unchanged. - */ -#define container_of_safe(ptr, type, member) ({ \ - void *__mptr = (void *)(ptr); \ - BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ - !__same_type(*(ptr), void), \ - "pointer type mismatch in container_of()"); \ - IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) : \ - ((type *)(__mptr - offsetof(type, member))); }) - /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD -- cgit v1.2.3 From cd7187e112c91186185c0553b1f9d033ed399d3a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:19 -0800 Subject: include/linux/list.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace the kernel.h inclusion with the list of what is really being used. Link: https://lkml.kernel.org/r/20211013170417.87909-5-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Boqun Feng Cc: Brendan Higgins Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Laurent Pinchart Cc: Mauro Carvalho Chehab Cc: Miguel Ojeda Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Sakari Ailus Cc: Thomas Gleixner Cc: Thorsten Leemhuis Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index f2af4b4aa4e9..6636fc07f918 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -2,11 +2,13 @@ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H +#include <linux/container_of.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> -#include <linux/kernel.h> + +#include <asm/barrier.h> /* * Circular doubly linked list implementation. -- cgit v1.2.3 From 50b09d6145da04e19fa448670e4918ce496f1c77 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:22 -0800 Subject: include/linux/llist.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace the kernel.h inclusion with the list of what is really being used.
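llist is the kernel's lock-less singly-linked list, and its entry accessor is also container_of() under the hood, which is why the diff below pulls in the new header. A small sketch (illustrative only; process_job() is a hypothetical helper):

    struct job {
            struct llist_node node;
            int id;
    };

    static LLIST_HEAD(jobs);

    /* Producers may push concurrently, even from hard-IRQ context: */
    static void push_job(struct job *j)
    {
            llist_add(&j->node, &jobs);
    }

    /* A consumer atomically takes the whole list and drains it: */
    static void drain_jobs(void)
    {
            struct llist_node *first = llist_del_all(&jobs);
            struct job *j, *tmp;

            llist_for_each_entry_safe(j, tmp, first, node)
                    process_job(j);
    }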
Link: https://lkml.kernel.org/r/20211013170417.87909-6-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Boqun Feng Cc: Brendan Higgins Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Laurent Pinchart Cc: Mauro Carvalho Chehab Cc: Miguel Ojeda Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Sakari Ailus Cc: Thomas Gleixner Cc: Thorsten Leemhuis Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/llist.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/llist.h b/include/linux/llist.h index 24f207b0190b..85bda2d02d65 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -49,7 +49,9 @@ */ #include <linux/atomic.h> -#include <linux/kernel.h> +#include <linux/container_of.h> +#include <linux/stddef.h> +#include <linux/types.h> struct llist_head { struct llist_node *first; -- cgit v1.2.3 From c540f9595956ff3c4b2cca1145c893941f921bcd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:25 -0800 Subject: include/linux/plist.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace the kernel.h inclusion with the list of what is really being used.
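For context, plist is the kernel's priority-sorted list (used by futexes and rt-mutexes, among others); a minimal usage sketch, not taken from the patch:

    struct waiter {
            struct plist_node node;         /* plist's accessors use container_of() */
    };

    static PLIST_HEAD(wait_list);

    static void add_waiter(struct waiter *w, int prio)
    {
            plist_node_init(&w->node, prio);
            plist_add(&w->node, &wait_list);        /* kept sorted by prio */
    }

This embedded-node design is also why the header only needs container_of.h, list.h, types.h and the bug-checking macros rather than all of kernel.h.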
[akpm@linux-foundation.org: cxd2880_common.h needs bits.h for GENMASK()] [andriy.shevchenko@linux.intel.com: delay.h: fix for removed kernel.h] Link: https://lkml.kernel.org/r/20211028170143.56523-1-andriy.shevchenko@linux.intel.com [akpm@linux-foundation.org: include/linux/fwnode.h needs bits.h for BIT()] Link: https://lkml.kernel.org/r/20211027150324.79827-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 2 +- include/linux/fwnode.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 1d0e2ce6b6d9..8eacf67eb212 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -19,7 +19,7 @@ * https://lists.openwall.net/linux-kernel/2011/01/09/56 */ -#include <linux/kernel.h> +#include <linux/math.h> extern unsigned long loops_per_jiffy; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 9f4ad719bfe3..3a532ba66f6c 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/list.h> +#include <linux/bits.h> #include <linux/err.h> struct fwnode_operations; -- cgit v1.2.3 From 1fcbd5deac51f30a7ed3f10bea422d08a2b3aba6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:35 -0800 Subject: include/linux/sbitmap.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when circular dependencies are involved. Replace the kernel.h inclusion with the list of what is really being used.
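sbitmap is the scalable bitmap used for things like blk-mq tag allocation; its basic use looks roughly like the following (illustrative only, and note the exact signatures vary between kernel versions; this assumes the v5.15-era API):

    struct sbitmap sb;
    int tag;

    /* 128 bits, auto-sized word shift, per-CPU alloc hints, no round-robin */
    if (sbitmap_init_node(&sb, 128, -1, GFP_KERNEL, NUMA_NO_NODE, false, true))
            return -ENOMEM;

    tag = sbitmap_get(&sb);         /* grab a free bit, or negative on failure */
    if (tag >= 0)
            sbitmap_clear_bit(&sb, tag);

    sbitmap_free(&sb);

The breadth of that API (atomics, per-CPU data, wait queues) is what drives the long replacement include list in the diff below.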
Link: https://lkml.kernel.org/r/20211027150528.80003-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 64ad900ac742..f7c1d21c2f39 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -9,8 +9,10 @@ #define _LINUX_RADIX_TREE_H #include -#include +#include #include +#include +#include #include #include #include -- cgit v1.2.3 From b4b87651104dd32ab258d097d47b97e243a95222 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:32:41 -0800 Subject: include/linux/generic-radix-tree.h: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. [akpm@linux-foundation.org: include math.h for round_up()] Link: https://lkml.kernel.org/r/20211027150548.80042-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/generic-radix-tree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index bfd00320c7f3..107613f7d792 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -38,8 +38,9 @@ #include #include -#include #include +#include +#include struct genradix_root; -- cgit v1.2.3 From e52340de11d8bca3ba35e5a72fea4f2a5b6abbbb Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 8 Nov 2021 18:32:43 -0800 Subject: kernel.h: split out instruction pointer accessors bottom_half.h needs _THIS_IP_ to be standalone, so split that and _RET_IP_ out from kernel.h into the new instruction_pointer.h. kernel.h directly needs them, so include it there and replace the include of kernel.h with this new file in bottom_half.h. 
Link: https://lkml.kernel.org/r/20211028161248.45232-1-andriy.shevchenko@linux.intel.com Signed-off-by: Stephen Rothwell Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bottom_half.h | 2 +- include/linux/instruction_pointer.h | 8 ++++++++ include/linux/kernel.h | 4 +--- 3 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 include/linux/instruction_pointer.h (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 11d107d88d03..fc53e0ad56d9 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,7 @@ #ifndef _LINUX_BH_H #define _LINUX_BH_H -#include +#include #include #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) diff --git a/include/linux/instruction_pointer.h b/include/linux/instruction_pointer.h new file mode 100644 index 000000000000..cda1f706eaeb --- /dev/null +++ b/include/linux/instruction_pointer.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_INSTRUCTION_POINTER_H +#define _LINUX_INSTRUCTION_POINTER_H + +#define _RET_IP_ (unsigned long)__builtin_return_address(0) +#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) + +#endif /* _LINUX_INSTRUCTION_POINTER_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index ed4465757cec..46ca4404fb93 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -53,9 +54,6 @@ } \ ) -#define _RET_IP_ (unsigned long)__builtin_return_address(0) -#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) - /** * upper_32_bits - return bits 32-63 of a number * @n: the number we're accessing -- cgit v1.2.3 From e1edc277e6f6dfb372216522dfc57f9381c39e35 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 8 Nov 2021 18:32:46 -0800 Subject: linux/container_of.h: switch to static_assert _Static_assert() is evaluated already in the compiler's frontend, and gives a somewhat more to-the-point error, compared to the BUILD_BUG_ON macro, which only fires after the optimizer has had a chance to eliminate calls to functions marked with __attribute__((error)). In theory, this might make builds a tiny bit faster. There's also a little less gunk in the error message emitted: lib/sort.c: In function `foo': include/linux/build_bug.h:78:41: error: static assertion failed: "pointer type mismatch in container_of()" 78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) compared to lib/sort.c: In function `foo': include/linux/compiler_types.h:322:38: error: call to `__compiletime_assert_2' declared with attribute error: pointer type mismatch in container_of() 322 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) While at it, fix the copy-pasto in container_of_safe().
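For reference, a minimal usage sketch (types hypothetical) of the check that now fires as a frontend _Static_assert rather than a post-optimizer BUILD_BUG_ON:

#include <linux/container_of.h>
#include <linux/list.h>

struct foo {
	int a;
	struct list_head node;
};

static inline struct foo *foo_from_node(struct list_head *n)
{
	/* Rejected at compile time unless n matches the type of foo.node. */
	return container_of(n, struct foo, node);
}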
Link: https://lkml.kernel.org/r/20211015090530.2774079-1-linux@rasmusvillemoes.dk Link: https://lore.kernel.org/lkml/20211014132331.GA4811@kernel.org/T/ Signed-off-by: Rasmus Villemoes Reviewed-by: Miguel Ojeda Acked-by: Andy Shevchenko Reviewed-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/container_of.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/container_of.h b/include/linux/container_of.h index dd56019838c6..2f4944b791b8 100644 --- a/include/linux/container_of.h +++ b/include/linux/container_of.h @@ -16,9 +16,9 @@ */ #define container_of(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ - BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ - !__same_type(*(ptr), void), \ - "pointer type mismatch in container_of()"); \ + static_assert(__same_type(*(ptr), ((type *)0)->member) || \ + __same_type(*(ptr), void), \ + "pointer type mismatch in container_of()"); \ ((type *)(__mptr - offsetof(type, member))); }) /** @@ -31,9 +31,9 @@ */ #define container_of_safe(ptr, type, member) ({ \ void *__mptr = (void *)(ptr); \ - BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ - !__same_type(*(ptr), void), \ - "pointer type mismatch in container_of()"); \ + static_assert(__same_type(*(ptr), ((type *)0)->member) || \ + __same_type(*(ptr), void), \ + "pointer type mismatch in container_of_safe()"); \ IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) : \ ((type *)(__mptr - offsetof(type, member))); }) -- cgit v1.2.3 From 505be48165fa2f2cb795659b4fc61f35563d105e Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Mon, 8 Nov 2021 18:33:12 -0800 Subject: lib, stackdepot: add helper to print stack entries To print stack entries, users of stackdepot first use stack_depot_fetch to get a list of stack entries and then use stack_trace_print to print this list. Provide a helper in stackdepot to print stack entries based on a stackdepot handle. Also change the above-mentioned users to use this helper. Link: https://lkml.kernel.org/r/20210915014806.3206938-3-imran.f.khan@oracle.com Signed-off-by: Imran Khan Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Daniel Vetter Cc: David Airlie Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index d29860966bc9..8ab37500fcba 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -25,6 +25,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +void stack_depot_print(depot_stack_handle_t stack); + #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else -- cgit v1.2.3 From 0f68d45ef41abb618a9ca33996348ae73800a106 Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Mon, 8 Nov 2021 18:33:16 -0800 Subject: lib, stackdepot: add helper to print stack entries into buffer To print stack entries into a buffer, users of stackdepot first get a list of stack entries using stack_depot_fetch and then print this list into a buffer using stack_trace_snprint. Provide a helper in stackdepot for this purpose. Also change the above-mentioned users to use this helper.
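A sketch of how the two helpers end up being used together (the caller, buffer size and depth are assumptions, not from the patch):

#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
#include <linux/printk.h>

static void report_current_stack(void)
{
	unsigned long entries[16];
	depot_stack_handle_t handle;
	unsigned int nr;
	char buf[256];

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	handle = stack_depot_save(entries, nr, GFP_KERNEL);

	/* One call replaces the stack_depot_fetch() + stack_trace_snprint() pair. */
	stack_depot_snprint(handle, buf, sizeof(buf), 2);
	pr_info("%s", buf);
}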
[imran.f.khan@oracle.com: fix build error] Link: https://lkml.kernel.org/r/20210915175321.3472770-4-imran.f.khan@oracle.com [imran.f.khan@oracle.com: export stack_depot_snprint() to modules] Link: https://lkml.kernel.org/r/20210916133535.3592491-4-imran.f.khan@oracle.com Link: https://lkml.kernel.org/r/20210915014806.3206938-4-imran.f.khan@oracle.com Signed-off-by: Imran Khan Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Acked-by: Jani Nikula [i915] Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Daniel Vetter Cc: David Airlie Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackdepot.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 8ab37500fcba..c34b55a6e554 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -25,6 +25,9 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, + int spaces); + void stack_depot_print(depot_stack_handle_t stack); #ifdef CONFIG_STACKDEPOT -- cgit v1.2.3 From bfb3ba32061da1a9217ef6d02fbcffb528e4c8df Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Mon, 8 Nov 2021 18:33:19 -0800 Subject: include/linux/string_helpers.h: add linux/string.h for strlen() linux/string_helpers.h uses strlen(), so include the corresponding header. Otherwise we get a compilation error if it's not also included by whoever included the helper. Link: https://lkml.kernel.org/r/20211005212634.3223113-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string_helpers.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 68189c4a2eb1..4ba39e1403b2 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -4,6 +4,7 @@ #include #include +#include #include struct file; -- cgit v1.2.3 From 1b1ad288b8f1b11f83396e537003722897ecc12b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:43 -0800 Subject: kallsyms: remove arch specific text and data check Patch series "sections: Unify kernel sections range check and use", v4. There are three header files (kallsyms.h, kernel.h and sections.h) which include kernel section range checks; let's do some cleanup and unify them. 1. clean up the arch-specific text/data check and fix the address boundary check in kallsyms.h 2. move all the basic/core kernel range check functions into sections.h 3. update all the callers, and use the helpers in sections.h to simplify the code After this series, we have 5 APIs for kernel section range checks in sections.h * is_kernel_rodata() --- already in sections.h * is_kernel_core_data() --- comes from core_kernel_data() in kernel.h * is_kernel_inittext() --- comes from kernel.h and kallsyms.h * __is_kernel_text() --- new internal helper * __is_kernel() --- new internal helper Note: people should not use the last two helpers directly; consider using the corresponding functions in kallsyms.h instead. This patch (of 11): Remove the arch-specific text and data checks; after commit 4ba66a976072 ("arch: remove blackfin port") there is no need for them.
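To make the resulting API concrete, a small classification sketch (the caller is hypothetical) using the helpers as they exist after this series:

/* Hypothetical: name the section an address falls into. */
static const char *addr_region(unsigned long addr)
{
	if (is_kernel_inittext(addr))
		return "inittext";
	if (is_kernel_text(addr))
		return "text";
	if (is_kernel_core_data(addr))
		return "core data";
	return "other";
}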
Link: https://lkml.kernel.org/r/20210930071143.63410-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210930071143.63410-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: David S. Miller Cc: Alexei Starovoitov Cc: Andrey Ryabinin Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christophe Leroy Cc: Kefeng Wang Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michal Simek Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index a1d6fc82d7f0..bac340972670 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -34,8 +34,7 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)) return 1; return in_gate_area_no_mm(addr); } -- cgit v1.2.3 From e7d5c4b0eb9be6ea5c2cfb510a9d6f56925118eb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:47 -0800 Subject: kallsyms: fix address-checks for kernel related range The is_kernel_inittext/is_kernel_text/is_kernel functions should not include the end address (the labels _einittext, _etext and _end) when checking the address range; the issue has existed since Linux v2.6.12. Link: https://lkml.kernel.org/r/20210930071143.63410-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Petr Mladek Reviewed-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: "David S.
Miller" Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index bac340972670..62ce22b27ea8 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -27,21 +27,21 @@ struct module; static inline int is_kernel_inittext(unsigned long addr) { if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) + && addr < (unsigned long)_einittext) return 1; return 0; } static inline int is_kernel_text(unsigned long addr) { - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)) + if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + if (addr >= (unsigned long)_stext && addr < (unsigned long)_end) return 1; return in_gate_area_no_mm(addr); } -- cgit v1.2.3 From a20deb3a348719adaf8c12e1bf4b599bfc51836e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:51 -0800 Subject: sections: move and rename core_kernel_data() to is_kernel_core_data() Move core_kernel_data() into sections.h and rename it to is_kernel_core_data(), also make it return bool value, then update all the callers. Link: https://lkml.kernel.org/r/20210930071143.63410-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: "David S. Miller" Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 46ca4404fb93..23f57a2d5a13 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -227,7 +227,6 @@ extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); extern int init_kernel_text(unsigned long addr); -extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -- cgit v1.2.3 From b9ad8fe7b8cab3814d1de41e8ddca602b2646f4b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:54 -0800 Subject: sections: move is_kernel_inittext() into sections.h The is_kernel_inittext() and init_kernel_text() are with same functionality, let's just keep is_kernel_inittext() and move it into sections.h, then update all the callers. 
Link: https://lkml.kernel.org/r/20210930071143.63410-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: "David S. Miller" Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 8 -------- include/linux/kernel.h | 1 - 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 62ce22b27ea8..bb5d9ec78e13 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -24,14 +24,6 @@ struct cred; struct module; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr < (unsigned long)_einittext) - return 1; - return 0; -} - static inline int is_kernel_text(unsigned long addr) { if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext)) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 23f57a2d5a13..be84ab369650 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -226,7 +226,6 @@ extern bool parse_option_str(const char *str, const char *option); extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); -extern int init_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -- cgit v1.2.3 From 8f6e42e83362650007ed298070a69d084f2c8baf Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:34:02 -0800 Subject: sections: provide internal __is_kernel() and __is_kernel_text() helper Add an internal __is_kernel() helper which only checks the kernel address ranges, and an internal __is_kernel_text() helper which only checks the text section ranges. Link: https://lkml.kernel.org/r/20210930071143.63410-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: "David S.
Miller" Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kallsyms.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index bb5d9ec78e13..4176c7eca7b5 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -26,14 +26,14 @@ struct module; static inline int is_kernel_text(unsigned long addr) { - if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext)) + if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr < (unsigned long)_end) + if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } -- cgit v1.2.3 From 5605f41917c6ad766814a8e417a4481e9157c991 Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Mon, 8 Nov 2021 18:35:07 -0800 Subject: crash_dump: fix boolreturn.cocci warning ./include/linux/crash_dump.h: 119: 50-51: WARNING: return of 0/1 in function 'is_kdump_kernel' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Link: https://lkml.kernel.org/r/20211020083905.1037952-1-deng.changcheng@zte.com.cn Signed-off-by: Changcheng Deng Reported-by: Zeal Robot Reviewed-by: Simon Horman Acked-by: Baoquan He Cc: Dave Young Cc: Mike Rapoport Cc: Vivek Goyal Cc: Ye Guojin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 0c547d866f1e..979c26176c1d 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -116,7 +116,7 @@ extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); #else /* !CONFIG_CRASH_DUMP */ -static inline bool is_kdump_kernel(void) { return 0; } +static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ /* Device Dump information to be filled by drivers */ -- cgit v1.2.3 From a10677a028b853c25054a4b8be04013ccb55682f Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Mon, 8 Nov 2021 18:35:10 -0800 Subject: crash_dump: remove duplicate include in crash_dump.h In crash_dump.h, header file is included twice. This duplication was introduced in commit 65fddcfca8ad("mm: reorder includes after introduction of linux/pgtable.h") where the order of the header files is adjusted, while the old one was not removed. Clean it up here. 
Link: https://lkml.kernel.org/r/20211020090659.1038877-1-ye.guojin@zte.com.cn Signed-off-by: Ye Guojin Reported-by: Zeal Robot Acked-by: Baoquan He Cc: Dave Young Cc: Mike Rapoport Cc: Vivek Goyal Cc: Changcheng Deng Cc: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_dump.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 979c26176c1d..620821549b23 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -8,8 +8,6 @@ #include #include -#include /* for pgprot_t */ - /* For IS_ENABLED(CONFIG_CRASH_DUMP) */ #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) -- cgit v1.2.3 From f26663684e76773ea86e2df13fb18f9d66c91151 Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Mon, 8 Nov 2021 18:35:13 -0800 Subject: signal: remove duplicate include in signal.h 'linux/string.h' included in 'signal.h' is duplicated; it's also included at line 7. Link: https://lkml.kernel.org/r/20211019024934.973008-1-ye.guojin@zte.com.cn Signed-off-by: Ye Guojin Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3f96a6374e4f..37f5080c070a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -126,7 +126,6 @@ static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS -#include #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ -- cgit v1.2.3 From 372904c080be44629d84bb15ed5e12eed44b5f9f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 8 Nov 2021 18:35:16 -0800 Subject: seq_file: move seq_escape() to a header Move seq_escape() to the header as an inliner, for a small kernel text size reduction. Link: https://lkml.kernel.org/r/20211001122917.67228-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dd99569595fd..103776e18555 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,21 @@ static inline void seq_escape_str(struct seq_file *m, const char *src, seq_escape_mem(m, src, strlen(src), flags, esc); } -void seq_escape(struct seq_file *m, const char *s, const char *esc); +/** + * seq_escape - print string into buffer, escaping some characters + * @m: target buffer + * @s: NULL-terminated string + * @esc: set of characters that need escaping + * + * Puts string into buffer, replacing each occurrence of character from + * @esc with usual octal escape. + * + * Use seq_has_overflowed() to check for errors.
+ */ +static inline void seq_escape(struct seq_file *m, const char *s, const char *esc) +{ + seq_escape_str(m, s, ESCAPE_OCTAL, esc); +} void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, -- cgit v1.2.3 From 10a6de19cad6efb9b49883513afb810dc265fca2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 8 Nov 2021 18:35:19 -0800 Subject: seq_file: fix passing wrong private data DEFINE_PROC_SHOW_ATTRIBUTE() is supposed to be used to define a series of functions and variables to register a proc file easily. And the users can use proc_create_data() to pass their own private data and get it via seq->private in the callback. Unfortunately, the proc file system uses PDE_DATA() to get private data instead of inode->i_private. So fix it. Fortunately, there is only one user of it which does not pass any private data, so this bug does not break any in-tree code. Link: https://lkml.kernel.org/r/20211029032638.84884-1-songmuchun@bytedance.com Fixes: 97a32539b956 ("proc: convert everything to "struct proc_ops"") Signed-off-by: Muchun Song Cc: Andy Shevchenko Cc: Stephen Rothwell Cc: Florent Revest Cc: Alexey Dobriyan Cc: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 103776e18555..72dbb44a4573 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = { \ #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ - return single_open(file, __name ## _show, inode->i_private); \ + return single_open(file, __name ## _show, PDE_DATA(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ -- cgit v1.2.3 From 278167fd2f8ffe679351605fe03e29ff3ab8db18 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Tue, 9 Nov 2021 16:29:49 -0800 Subject: block: add __must_check for *add_disk*() callers Now that we have done a spring cleaning on all drivers and added error checking / handling, let's keep it that way and ensure no new drivers fail to stick with it.
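A sketch of the error handling that __must_check now forces on every caller (driver name and cleanup path are hypothetical):

static int mydrv_register_disk(struct gendisk *disk)
{
	int err;

	err = add_disk(disk);		/* result may no longer be ignored */
	if (err)
		blk_cleanup_disk(disk);	/* undo the earlier blk_alloc_disk() */
	return err;
}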
Reviewed-by: Christoph Hellwig Signed-off-by: Luis Chamberlain Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20211110002949.999380-1-mcgrof@kernel.org Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 462634b4b48f..74c410263113 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -205,9 +205,9 @@ static inline dev_t disk_devt(struct gendisk *disk) void disk_uevent(struct gendisk *disk, enum kobject_action action); /* block/genhd.c */ -int device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline int add_disk(struct gendisk *disk) +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int __must_check add_disk(struct gendisk *disk) { return device_add_disk(NULL, disk, NULL); } -- cgit v1.2.3 From 64355db3caf6468dc711995239efe0cbcd7d0091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 10 Nov 2021 13:16:55 +0100 Subject: mod_devicetable: fix kdocs for ishtp_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kdocs were copied from another device_id struct and not adapted. Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()") Signed-off-by: Thomas Weißschuh Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- include/linux/mod_devicetable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index befbf53c4b7c..c70abe7aaef2 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -901,8 +901,7 @@ struct dfl_device_id { /** * struct ishtp_device_id - ISHTP device identifier - * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba - * @context: pointer to driver specific data + * @guid: GUID of the device. */ struct ishtp_device_id { guid_t guid; -- cgit v1.2.3 From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 7 Nov 2021 14:51:16 +1000 Subject: printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces printk from NMI context relies on irq work being raised on the local CPU to print to console. This can be a problem if the NMI was raised by a lockup detector to print lockup stack and regs, because the CPU may not enable irqs (because it is locked up). Introduce printk_trigger_flush() that can be called on another CPU to try to get those messages to the console; call that where printk_safe_flush was previously called.
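For illustration, a sketch of the intended call pattern (the surrounding watchdog function is hypothetical):

#include <linux/nmi.h>
#include <linux/printk.h>

/* Hypothetical lockup-detector path: dump a stuck CPU, then flush for it. */
static void report_stuck_cpu(int cpu)
{
	trigger_single_cpu_backtrace(cpu);	/* printk()s from NMI on @cpu */
	printk_trigger_flush();			/* flush from this, still-working, CPU */
}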
Fixes: 93d102f094be ("printk: remove safe buffers") Cc: stable@vger.kernel.org # 5.15 Signed-off-by: Nicholas Piggin Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com --- include/linux/printk.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index a1379df43251..596ad6fa0336 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -206,6 +206,7 @@ void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; +void printk_trigger_flush(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -282,6 +283,9 @@ static inline void dump_stack_lvl(const char *log_lvl) static inline void dump_stack(void) { } +static inline void printk_trigger_flush(void) +{ +} #endif #ifdef CONFIG_SMP -- cgit v1.2.3 From a19672f6b9718df247a9087a266150dbe833022e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Aug 2021 21:54:58 +0100 Subject: folio: Add a function to change the private data attached to a folio Add a function, folio_change_private(), that will change the private data attached to a folio, without the need to twiddle the private bit or the refcount. It assumes that folio_attach_private() has already been called on the page. Signed-off-by: David Howells Reviewed-by: Matthew Wilcox (Oracle) Tested-by: Jeff Layton Tested-by: Dominique Martinet Tested-by: kafs-testing@auristor.com Link: https://lore.kernel.org/r/162981149911.1901565.17776700811659843340.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/163005743485.2472992.5100702469503007023.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163584180781.4023316.5037526301198034310.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/163649324326.309189.17817587229450840783.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/163657848531.834781.14269656212269187893.stgit@warthog.procyon.org.uk/ # v5 --- include/linux/pagemap.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6a30916b76e5..1f560aecd9b5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -279,6 +279,25 @@ static inline void folio_attach_private(struct folio *folio, void *data) folio_set_private(folio); } +/** + * folio_change_private - Change private data on a folio. + * @folio: Folio to change the data on. + * @data: Data to set on the folio. + * + * Change the private data attached to a folio and return the old + * data. The page must previously have had data attached and the data + * must be detached before the folio will be freed. + * + * Return: Data that was previously attached to the folio. + */ +static inline void *folio_change_private(struct folio *folio, void *data) +{ + void *old = folio_get_private(folio); + + folio->private = data; + return old; +} + /** * folio_detach_private - Detach private data from a folio. * @folio: Folio to detach data from.
-- cgit v1.2.3 From 452c472e26348df1e7052544130aa98eebbd2331 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Aug 2021 22:09:57 +0100 Subject: folio: Add a function to get the host inode for a folio Add a convenience function, folio_inode() that will get the host inode from a folio's mapping. Changes: ver #3: - Fix mistake in function description[2]. ver #2: - Fix contradiction between doc and implementation by disallowing use with swap caches[1]. Signed-off-by: David Howells Reviewed-by: Matthew Wilcox (Oracle) Tested-by: Jeff Layton Tested-by: Dominique Martinet Tested-by: kafs-testing@auristor.com Link: https://lore.kernel.org/r/YST8OcVNy02Rivbm@casper.infradead.org/ [1] Link: https://lore.kernel.org/r/YYKLkBwQdtn4ja+i@casper.infradead.org/ [2] Link: https://lore.kernel.org/r/162880453171.3369675.3704943108660112470.stgit@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162981151155.1901565.7010079316994382707.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/163005744370.2472992.18324470937328925723.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163584184628.4023316.9386282630968981869.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/163649325519.309189.15072332908703129455.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/163657850401.834781.1031963517399283294.stgit@warthog.procyon.org.uk/ # v5 --- include/linux/pagemap.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1f560aecd9b5..1a0c646eb6ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -253,6 +253,20 @@ static inline struct address_space *page_mapping_file(struct page *page) return folio_mapping(folio); } +/** + * folio_inode - Get the host inode for this folio. + * @folio: The folio. + * + * For folios which are in the page cache, return the inode that this folio + * belongs to. + * + * Do not call this for folios which aren't in the page cache. + */ +static inline struct inode *folio_inode(struct folio *folio) +{ + return folio->mapping->host; +} + static inline bool page_cache_add_speculative(struct page *page, int count) { VM_BUG_ON_PAGE(PageTail(page), page); -- cgit v1.2.3 From 78525c74d9e7d1a6ce69bd4388f045f6e474a20b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Aug 2021 09:49:13 +0100 Subject: netfs, 9p, afs, ceph: Use folios Convert the netfs helper library to use folios throughout, convert the 9p and afs filesystems to use folios in their file I/O paths and convert the ceph filesystem to use just enough folios to compile. With these changes, afs passes -g quick xfstests. Changes ======= ver #5: - Got rid of folio_end{io,_read,_write}() and inlined the stuff it does instead (Willy decided he didn't want this after all). ver #4: - Fixed a bug in afs_redirty_page() whereby it didn't set the next page index in the loop and returned too early. - Simplified a check in v9fs_vfs_write_folio_locked()[1]. - Undid a change to afs_symlink_readpage()[1]. - Used offset_in_folio() in afs_write_end()[1]. - Changed from using page_endio() to folio_end{io,_read,_write}()[1]. ver #2: - Add 9p foliation. 
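The folio helpers added above end up being used along these lines in a filesystem's I/O paths (sketch; the callback and check are hypothetical):

/* Hypothetical: decide whether a page-cache folio covers in-file data. */
static bool myfs_folio_in_file(struct folio *folio, loff_t pos)
{
	struct inode *inode = folio_inode(folio);	/* page-cache folios only */

	return pos < i_size_read(inode);
}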
Signed-off-by: David Howells Reviewed-by: Jeff Layton Tested-by: Jeff Layton Tested-by: Dominique Martinet Tested-by: kafs-testing@auristor.com cc: Matthew Wilcox (Oracle) cc: Marc Dionne cc: Ilya Dryomov cc: Dominique Martinet cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/YYKa3bfQZxK5/wDN@casper.infradead.org/ [1] Link: https://lore.kernel.org/r/2408234.1628687271@warthog.procyon.org.uk/ # rfc Link: https://lore.kernel.org/r/162877311459.3085614.10601478228012245108.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/162981153551.1901565.3124454657133703341.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/163005745264.2472992.9852048135392188995.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163584187452.4023316.500389675405550116.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/163649328026.309189.1124218109373941936.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/163657852454.834781.9265101983152100556.stgit@warthog.procyon.org.uk/ # v5 --- include/linux/netfs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 12c4177f7703..ca0683b9e3d1 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -166,13 +166,13 @@ struct netfs_read_request { short error; /* 0 or error that occurred */ loff_t i_size; /* Size of the file */ loff_t start; /* Start position */ - pgoff_t no_unlock_page; /* Don't unlock this page after read */ + pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ refcount_t usage; unsigned long flags; #define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_WRITE_TO_CACHE 1 /* Need to write to the cache */ -#define NETFS_RREQ_NO_UNLOCK_PAGE 2 /* Don't unlock no_unlock_page on completion */ -#define NETFS_RREQ_DONT_UNLOCK_PAGES 3 /* Don't unlock the pages on completion */ +#define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ +#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ const struct netfs_read_request_ops *netfs_ops; @@ -190,7 +190,7 @@ struct netfs_read_request_ops { void (*issue_op)(struct netfs_read_subrequest *subreq); bool (*is_still_valid)(struct netfs_read_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct page *page, void **_fsdata); + struct folio *folio, void **_fsdata); void (*done)(struct netfs_read_request *rreq); void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; @@ -240,11 +240,11 @@ extern void netfs_readahead(struct readahead_control *, const struct netfs_read_request_ops *, void *); extern int netfs_readpage(struct file *, - struct page *, + struct folio *, const struct netfs_read_request_ops *, void *); extern int netfs_write_begin(struct file *, struct address_space *, - loff_t, unsigned int, unsigned int, struct page **, + loff_t, unsigned int, unsigned int, struct folio **, void **, const struct netfs_read_request_ops *, void *); -- cgit v1.2.3 From 68dbbe7d5b4fde736d104cbbc9a2fce875562012 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 4 Nov 2021 17:31:58 +0900 Subject: libata: fix read log timeout value Some ATA 
drives are very slow to respond to READ_LOG_EXT and READ_LOG_DMA_EXT commands issued from ata_dev_configure() when the device is revalidated right after resuming a system or inserting the ATA adapter driver (e.g. ahci). The default 5s timeout (ATA_EH_CMD_DFL_TIMEOUT) used for these commands is too short, causing errors during the device configuration. Ex: ... ata9: SATA max UDMA/133 abar m524288@0x9d200000 port 0x9d200400 irq 209 ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300) ata9.00: ATA-9: XXX XXXXXXXXXXXXXXX, XXXXXXXX, max UDMA/133 ata9.00: qc timeout (cmd 0x2f) ata9.00: Read log page 0x00 failed, Emask 0x4 ata9.00: Read log page 0x00 failed, Emask 0x40 ata9.00: NCQ Send/Recv Log not supported ata9.00: Read log page 0x08 failed, Emask 0x40 ata9.00: 27344764928 sectors, multi 16: LBA48 NCQ (depth 32), AA ata9.00: Read log page 0x00 failed, Emask 0x40 ata9.00: ATA Identify Device Log not supported ata9.00: failed to set xfermode (err_mask=0x40) ata9: SATA link up 6.0 Gbps (SStatus 133 SControl 300) ata9.00: configured for UDMA/133 ... The timeout error causes a soft reset of the drive link, followed in most cases by a successful revalidation as that gives the drive enough time to become fully ready to quickly process the read log commands. However, in some cases, this also fails resulting in the device being dropped. Fix this by adding the ata_eh_revalidate_timeouts entries for the READ_LOG_EXT and READ_LOG_DMA_EXT commands. This defines a timeout increased to 15s, retriable one time. Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 2884383f1718..5331557316e8 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -394,7 +394,7 @@ enum { /* This should match the actual table size of * ata_eh_cmd_timeout_table in libata-eh.c. */ - ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 6, + ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7, /* Horkage types. May be set by libata or controller on drives (some horkage may be drive/controller pair dependent */ -- cgit v1.2.3 From a25efb3863d068929f0bbeb87a995df11507e691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 23 Sep 2021 13:57:42 +0200 Subject: dma-buf: add dma_fence_describe and dma_resv_describe v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add functions to dump dma_fence and dma_resv objects into a seq_file and use them for printing the debugfs information. v2: fix missing include reported by test robot.
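A sketch of the intended use from a driver's debugfs code (the show callback and how the dma_resv pointer is obtained are hypothetical):

static int mydrv_resv_show(struct seq_file *m, void *unused)
{
	struct dma_resv *resv = m->private;

	dma_resv_describe(resv, m);	/* prints each fence via dma_fence_describe() */
	return 0;
}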
Signed-off-by: Christian König Reviewed-by: Rob Clark Link: https://patchwork.freedesktop.org/patch/msgid/20211103081231.18578-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 1 + include/linux/dma-resv.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index a706b7bf51d7..1ea691753bd3 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -264,6 +264,7 @@ void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, void dma_fence_release(struct kref *kref); void dma_fence_free(struct dma_fence *fence); +void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq); /** * dma_fence_put - decreases refcount of the fence diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index dbd235ab447f..09c6063b199a 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -490,5 +490,6 @@ int dma_resv_copy_fences(struct dma_resv *dst, struct dma_resv *src); long dma_resv_wait_timeout(struct dma_resv *obj, bool wait_all, bool intr, unsigned long timeout); bool dma_resv_test_signaled(struct dma_resv *obj, bool test_all); +void dma_resv_describe(struct dma_resv *obj, struct seq_file *seq); #endif /* _LINUX_RESERVATION_H */ -- cgit v1.2.3 From 9c8e9c9681a0f3f1ae90a90230d059c7a1dece5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Nov 2021 00:27:29 +0100 Subject: PCI/MSI: Move non-mask check back into low level accessors The recent rework of PCI/MSI[X] masking moved the non-mask checks from the low level accessors into the higher level mask/unmask functions. This missed the fact that these accessors can be invoked from other places as well. The missing checks break XEN-PV which sets pci_msi_ignore_mask and also violates the virtual MSIX and the msi_attrib.maskbit protections. Instead of sprinkling checks all over the place, lift them back into the low level accessor functions. To avoid checking three different conditions, combine them into one property of msi_desc::msi_attrib. [ josef: Fixed the missed conversion in the core code ] Fixes: fcacdfbef5a1 ("PCI/MSI: Provide a new set of mask and unmask functions") Reported-by: Josef Johansson Signed-off-by: Thomas Gleixner Tested-by: Josef Johansson Cc: Bjorn Helgaas Cc: stable@vger.kernel.org --- include/linux/msi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 49cf6eb222e7..e616f94c7c58 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -148,7 +148,7 @@ struct msi_desc { u8 is_msix : 1; u8 multiple : 3; u8 multi_cap : 3; - u8 maskbit : 1; + u8 can_mask : 1; u8 is_64 : 1; u8 is_virtual : 1; u16 entry_nr; -- cgit v1.2.3 From 2226667a145db2e1f314d7f57fd644fe69863ab9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 4 Nov 2021 18:01:29 +0000 Subject: PCI/MSI: Deal with devices lying about their MSI mask capability It appears that some devices are lying about their mask capability, pretending that they don't have it, while they actually do. The net result is that we now don't enable MSIs on such an endpoint. Add a new per-device flag to deal with this. Further patches will make use of it, sadly.
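Such a follow-up would presumably be a fixup quirk along these lines (vendor and device IDs hypothetical):

/* Hypothetical quirk: this device does honor MSI masking. */
static void quirk_msi_masking_works(struct pci_dev *dev)
{
	dev->dev_flags |= PCI_DEV_FLAGS_HAS_MSI_MASKING;
}
DECLARE_PCI_FIXUP_FINAL(0x1234, 0x5678, quirk_msi_masking_works);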
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20211104180130.3825416-2-maz@kernel.org Cc: Bjorn Helgaas --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index c8afbee5da4b..d0dba7fd9848 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -233,6 +233,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), /* Don't use Relaxed Ordering for TLPs directed at this device */ PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), + /* Device does honor MSI masking despite saying otherwise */ + PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From a8b76910e465d718effce0cad306a21fa4f3526b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 10 Nov 2021 20:24:44 +0000 Subject: preempt: Restore preemption model selection configs Commit c597bfddc9e9 ("sched: Provide Kconfig support for default dynamic preempt mode") changed the selectable config names for the preemption model. This means a config file must now select CONFIG_PREEMPT_BEHAVIOUR=y rather than CONFIG_PREEMPT=y to get a preemptible kernel. This means all arch config files would need to be updated - right now they'll all end up with the default CONFIG_PREEMPT_NONE_BEHAVIOUR. Rather than touch a good hundred config files, restore usage of CONFIG_PREEMPT{_NONE, _VOLUNTARY}. Make them configure: o The build-time preemption model when !PREEMPT_DYNAMIC o The default boot-time preemption model when PREEMPT_DYNAMIC Add siblings of those configs with the _BUILD suffix to unconditionally designate the build-time preemption model (PREEMPT_DYNAMIC is built with the "highest" preemption model it supports, aka PREEMPT). Downstream configs should by now all be depending on / selected by CONFIG_PREEMPTION rather than CONFIG_PREEMPT, so only a few sites need patching up. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://lore.kernel.org/r/20211110202448.4054153-2-valentin.schneider@arm.com --- include/linux/kernel.h | 2 +- include/linux/vermagic.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 968b4c4fe65b..77755ac3e189 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,7 +85,7 @@ struct completion; struct user; -#ifdef CONFIG_PREEMPT_VOLUNTARY +#ifdef CONFIG_PREEMPT_VOLUNTARY_BUILD extern int __cond_resched(void); # define might_resched() __cond_resched() diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 1eaaa93c37bf..329d63babaeb 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,7 +15,7 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_BUILD #define MODULE_VERMAGIC_PREEMPT "preempt " #elif defined(CONFIG_PREEMPT_RT) #define MODULE_VERMAGIC_PREEMPT "preempt_rt " -- cgit v1.2.3 From 2f70ddb1f71814aae525c58086fcb2f6974e6591 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 24 Aug 2021 11:06:40 +0000 Subject: EFI: Introduce the new AMD Memory Encryption GUID. Introduce a new AMD Memory Encryption GUID which is currently used for defining a new UEFI environment variable which indicates UEFI/OVMF support for the SEV live migration feature.
This variable is set up when UEFI/OVMF detects host/hypervisor support for SEV live migration and later this variable is read by the kernel using EFI runtime services to verify if OVMF supports the live migration feature. Signed-off-by: Ashish Kalra Acked-by: Ard Biesheuvel Message-Id: <1cea22976d2208f34d47e0c1ce0ecac816c13111.1629726117.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 6b5d36babfcc..dbd39b20e034 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -362,6 +362,7 @@ void efi_native_runtime_setup(void); /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) +#define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) typedef struct { efi_guid_t guid; -- cgit v1.2.3 From f4d316537059b274452727e86f46ff3bdefdde4d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Nov 2021 10:13:38 -0500 Subject: KVM: generalize "bugged" VM to "dead" VM Generalize KVM_REQ_VM_BUGGED so that it can be called even in cases where it is by design that the VM cannot be operated upon. In this case any KVM_BUG_ON should still warn, so introduce a new flag kvm->vm_dead that is separate from kvm->vm_bugged. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 60a35d9fe259..9e0667e3723e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,7 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 -#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VM_DEAD (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -617,6 +617,7 @@ struct kvm { unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool vm_bugged; + bool vm_dead; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -650,12 +651,19 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline void kvm_vm_dead(struct kvm *kvm) +{ + kvm->vm_dead = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); +} + static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; - kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); + kvm_vm_dead(kvm); } + #define KVM_BUG(cond, kvm, fmt...) \ ({ \ int __ret = (cond); \ -- cgit v1.2.3 From 0093de693fe7baf31641d1863793e6db93a43b6e Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Wed, 10 Nov 2021 20:32:30 -0800 Subject: mm/page_owner.c: modify the type of argument "order" in some functions The type of "order" in struct page_owner is unsigned short. However, it is unsigned int in the following 3 functions: __reset_page_owner __set_page_owner __set_page_owner_handle The type of "order" in the argument list is unsigned int, which is inconsistent.
[akpm@linux-foundation.org: update include/linux/page_owner.h] Link: https://lkml.kernel.org/r/20211020125945.47792-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 43c638c51c1f..119a0c9d2a8b 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -8,9 +8,9 @@ extern struct static_key_false page_owner_inited; extern struct page_ext_operations page_owner_ops; -extern void __reset_page_owner(struct page *page, unsigned int order); +extern void __reset_page_owner(struct page *page, unsigned short order); extern void __set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask); + unsigned short order, gfp_t gfp_mask); extern void __split_page_owner(struct page *page, unsigned int nr); extern void __folio_copy_owner(struct folio *newfolio, struct folio *old); extern void __set_page_owner_migrate_reason(struct page *page, int reason); @@ -18,14 +18,14 @@ extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone); -static inline void reset_page_owner(struct page *page, unsigned int order) +static inline void reset_page_owner(struct page *page, unsigned short order) { if (static_branch_unlikely(&page_owner_inited)) __reset_page_owner(page, order); } static inline void set_page_owner(struct page *page, - unsigned int order, gfp_t gfp_mask) + unsigned short order, gfp_t gfp_mask) { if (static_branch_unlikely(&page_owner_inited)) __set_page_owner(page, order, gfp_mask); @@ -52,7 +52,7 @@ static inline void dump_page_owner(const struct page *page) __dump_page_owner(page); } #else -static inline void reset_page_owner(struct page *page, unsigned int order) +static inline void reset_page_owner(struct page *page, unsigned short order) { } static inline void set_page_owner(struct page *page, @@ -60,7 +60,7 @@ static inline void set_page_owner(struct page *page, { } static inline void split_page_owner(struct page *page, - unsigned int order) + unsigned short order) { } static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio) -- cgit v1.2.3 From ab09243aa95a72bac5c71e852773de34116f8d0f Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 10 Nov 2021 20:32:40 -0800 Subject: mm/migrate.c: remove MIGRATE_PFN_LOCKED MIGRATE_PFN_LOCKED is used to indicate to migrate_vma_prepare() that a source page was already locked during migrate_vma_collect(). If it wasn't, then a second attempt is made to lock the page. However, if the first attempt failed, it's unlikely a second attempt will succeed, and the retry adds complexity. So clean this up by removing the retry and the MIGRATE_PFN_LOCKED flag. Destination pages are also meant to have the MIGRATE_PFN_LOCKED flag set, but nothing actually checks that.
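With the flag gone, a migrate_vma user is left testing only the remaining bits, e.g. (sketch, helper hypothetical):

/* Hypothetical driver check: a source entry is usable for migration
 * when it is valid and marked for migration; no lock bit to consider.
 */
static bool src_entry_usable(const struct migrate_vma *args, unsigned long i)
{
	return (args->src[i] & MIGRATE_PFN_VALID) &&
	       (args->src[i] & MIGRATE_PFN_MIGRATE);
}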
Link: https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Ralph Campbell Acked-by: Felix Kuehling Cc: Alex Deucher Cc: Jerome Glisse Cc: John Hubbard Cc: Zi Yan Cc: Christoph Hellwig Cc: Ben Skeggs Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index eeb818c4fc78..4850cc5bf813 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -110,7 +110,6 @@ static inline int migrate_misplaced_page(struct page *page, */ #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) -#define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) #define MIGRATE_PFN_SHIFT 6 -- cgit v1.2.3 From 68da4e0eaaab421f228eac57cbe7505b136464af Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 10 Nov 2021 12:01:14 -0600 Subject: Revert "PCI: Remove struct pci_dev->driver" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b5f9c644eb1baafcd349ad134e2110773f8d0a38. Revert b5f9c644eb1b ("PCI: Remove struct pci_dev->driver"), which is needed to revert 2a4d9408c9e8 ("PCI: Use to_pci_driver() instead of pci_dev->driver"). 2a4d9408c9e8 caused a NULL pointer dereference reported by Robert Święcki. Details in the revert of that commit. Fixes: 2a4d9408c9e8 ("PCI: Use to_pci_driver() instead of pci_dev->driver") Link: https://lore.kernel.org/linux-i2c/CAP145pgdrdiMAT7=-iB1DMgA7t_bMqTcJL4N0=6u8kNY3EU0dw@mail.gmail.com/ Reported-by: Robert Święcki Tested-by: Robert Święcki Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index b4dbcc86b3f1..e58888e21ab7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -342,6 +342,7 @@ struct pci_dev { u16 pcie_flags_reg; /* Cached PCIe Capabilities Register */ unsigned long *dma_alias_mask;/* Mask of enabled devfn aliases */ + struct pci_driver *driver; /* Driver bound to this device */ u64 dma_mask; /* Mask of the bits of bus address this device implements. Normally this is 0xffffffff. You only need to change -- cgit v1.2.3 From bf9167a8b40c9cf463521da05342db81808c1b6e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Nov 2021 09:56:33 +0100 Subject: HID: intel-ish-hid: fix module device-id handling A late addition to the intel-ish-hid framework caused a build failure with clang, and introduced an ABI to the module loader that stops working if any driver ever needs to bind to more than one UUID: drivers/hid/intel-ish-hid/ishtp-fw-loader.c:1067:4: error: initializer element is not a compile-time constant Change the ishtp_device_id to have correct documentation and a driver_data field like all the other ones, and change the drivers to use the ID table as the primary identification in a way that works with all compilers and avoids duplicating the identifiers.
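In a client driver the new scheme looks roughly like this (driver name and GUID value hypothetical):

/* Hypothetical ISHTP client: match via an ID table instead of a bare guid. */
static const struct ishtp_device_id mydrv_id_table[] = {
	{ .guid = GUID_INIT(0x12345678, 0x1234, 0x5678,
			    0x9a, 0xbc, 0xde, 0xf0, 0x12, 0x34, 0x56, 0x78), },
	{ }
};
MODULE_DEVICE_TABLE(ishtp, mydrv_id_table);

static struct ishtp_cl_driver mydrv_driver = {
	.name = "mydrv",
	.id = mydrv_id_table,
	/* .probe, .remove, ... */
};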
Fixes: f155dfeaa4ee ("platform/x86: isthp_eclite: only load for matching devices") Fixes: facfe0a4fdce ("platform/chrome: chros_ec_ishtp: only load for matching devices") Fixes: 0d0cccc0fd83 ("HID: intel-ish-hid: hid-client: only load for matching devices") Fixes: 44e2a58cb880 ("HID: intel-ish-hid: fw-loader: only load for matching devices") Fixes: cb1a2c6847f7 ("HID: intel-ish-hid: use constants for modaliases") Fixes: fa443bc3c1e4 ("HID: intel-ish-hid: add support for MODULE_DEVICE_TABLE()") Signed-off-by: Arnd Bergmann Reviewed-by: Hans de Goede [jkosina@suse.cz: fix ecl_ishtp_cl_driver.id initialization] [jkosina@suse.cz: fix conflict with already fixed kerneldoc] Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 4 ++-- include/linux/mod_devicetable.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index aee8ff4739b1..f45f13304add 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -9,7 +9,7 @@ #define _INTEL_ISH_CLIENT_IF_H_ #include -#include +#include struct ishtp_cl_device; struct ishtp_device; @@ -40,7 +40,7 @@ enum cl_state { struct ishtp_cl_driver { struct device_driver driver; const char *name; - const guid_t *guid; + const struct ishtp_device_id *id; int (*probe)(struct ishtp_cl_device *dev); void (*remove)(struct ishtp_cl_device *dev); int (*reset)(struct ishtp_cl_device *dev); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index c70abe7aaef2..4bb71979a8fd 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -902,9 +902,11 @@ struct dfl_device_id { /** * struct ishtp_device_id - ISHTP device identifier * @guid: GUID of the device. + * @driver_data: pointer to driver specific data */ struct ishtp_device_id { guid_t guid; + kernel_ulong_t driver_data; }; #endif /* LINUX_MOD_DEVICETABLE_H */ -- cgit v1.2.3 From 636f6e2af4fb916b9f1b432964294b6979c34002 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 9 Nov 2021 08:45:25 +0900 Subject: libata: add horkage for missing Identify Device log ACS-3 introduced the ATA Identify Device Data log as mandatory. A warning message currently signals to the user if a device does not report supporting this log page in the log directory page, regardless of the ATA version of the device. Furthermore, this warning will appear for all attempts at accessing this missing log page during device revalidation. Since it is useless to constantly access the log directory and warn about this lack of support once we have discovered that the device does not support this log page, introduce the horkage flag ATA_HORKAGE_NO_ID_DEV_LOG to mark a device as lacking support for the Identify Device Data log page. Set this flag when ata_log_supported() returns false in ata_identify_page_supported(). The warning is printed only if the device ATA level is 10 or above (ACS-3 or above), and only once on device scan. With this flag set, the log directory page is not accessed again to test for Identify Device Data log page support. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5331557316e8..2a8404b26083 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -427,6 +427,7 @@ enum { ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ + ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 32a370abf12f82c8383e430c21365f5355d8b288 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 12 Nov 2021 12:07:02 -0500 Subject: net,lsm,selinux: revert the security_sctp_assoc_established() hook This patch reverts two prior patches, e7310c94024c ("security: implement sctp_assoc_established hook in selinux") and 7c2ef0240e6a ("security: add sctp_assoc_established hook"), which create the security_sctp_assoc_established() LSM hook and provide a SELinux implementation. Unfortunately these two patches were merged without proper review (the Reviewed-by and Tested-by tags from Richard Haines were for previous revisions of these patches that were significantly different) and there are outstanding objections from the SELinux maintainers regarding these patches. Work is currently ongoing to correct the problems identified in the reverted patches, as well as others that have come up during review, but it is unclear at this point in time when that work will be ready for inclusion in the mainline kernel. In the interest of not keeping objectionable code in the kernel for multiple weeks, and potentially a kernel release, we are reverting the two problematic patches. Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 -- include/linux/lsm_hooks.h | 5 ----- include/linux/security.h | 7 ------- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 442a611fa0fb..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, - struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d6823214d5c1..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,11 +1050,6 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 06eac4e61a13..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } - -static inline void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ -} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3 From 1aa3b2207e889a948049c9a8016cedb0218c2389 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 12 Nov 2021 18:18:10 -0500 Subject: net,lsm,selinux: revert the security_sctp_assoc_established() hook This patch reverts two prior patches, e7310c94024c ("security: implement sctp_assoc_established hook in selinux") and 7c2ef0240e6a ("security: add sctp_assoc_established hook"), which create the security_sctp_assoc_established() LSM hook and provide a SELinux implementation. Unfortunately these two patches were merged without proper review (the Reviewed-by and Tested-by tags from Richard Haines were for previous revisions of these patches that were significantly different) and there are outstanding objections from the SELinux maintainers regarding these patches. Work is currently ongoing to correct the problems identified in the reverted patches, as well as others that have come up during review, but it is unclear at this point in time when that work will be ready for inclusion in the mainline kernel. In the interest of not keeping objectionable code in the kernel for multiple weeks, and potentially a kernel release, we are reverting the two problematic patches. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- include/linux/lsm_hook_defs.h | 2 -- include/linux/lsm_hooks.h | 5 ----- include/linux/security.h | 7 ------- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 442a611fa0fb..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, - struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d6823214d5c1..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,11 +1050,6 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 06eac4e61a13..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } - -static inline void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ -} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND -- cgit v1.2.3 From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 14 Nov 2021 13:28:34 -0500 Subject: tracing: Add length protection to histogram string copies The string copies to the histogram storage have a max size of 256 bytes (defined by MAX_FILTER_STR_VAL). Only the string size of the event field needs to be copied to the event storage, but no more than what is in the event storage. Although nothing should be bigger than 256 bytes, there's no protection against overwriting of the storage if one day there is. Copy no more than the destination size, and enforce it. Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the min() comparison of the string sizes of comparable types. Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reported-by: Linus Torvalds Reviewed-by: Masami Hiramatsu Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 50453b287615..2d167ac3452c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,7 +673,7 @@ struct trace_event_file { #define PERF_MAX_TRACE_SIZE 8192 -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ +#define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), -- cgit v1.2.3 From 10a2308ffb8cf262e473eb324fde42ae31b6da04 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 12 Nov 2021 18:16:34 +0800 Subject: net: Clean up some inconsistent indenting Eliminate the following smatch warning: ./include/linux/skbuff.h:4229 skb_remcsum_process() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..c8cb7e697d47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4226,7 +4226,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, return; } - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } -- cgit v1.2.3 From a0ddee65c527d877e798205c1391c6170e580c66 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Nov 2021 16:07:49 +0200 Subject: printk: Remove printk.h inclusion in percpu.h After the commit 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") the printk.h is not needed anymore in percpu.h. Moreover `make headerdep` complains (an excerpt) In file included from linux/printk.h, from linux/dynamic_debug.h:188 from linux/printk.h:559 <-- here from linux/percpu.h:9 from linux/idr.h:17 include/net/9p/client.h:13: warning: recursive header inclusion Yeah, it's not a root cause of this, but removing will help to reduce the noise. Fixes: 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") Signed-off-by: Andy Shevchenko Acked-by: Dennis Zhou Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211112140749.80042-1-andriy.shevchenko@linux.intel.com --- include/linux/percpu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..4fa3000f9c22 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From e9380df851878cee71df5a1c7611584421527f7e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 31 Oct 2021 20:48:52 -0500 Subject: ACPI: Add stubs for wakeup handler functions The commit ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") added new functions for drivers to use during the s2idle wakeup path, but didn't add stubs for when CONFIG_ACPI wasn't set. Add those stubs in for other drivers to be able to use. Fixes: ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") Acked-by: Rafael J. Wysocki Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20211101014853.6177-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- include/linux/acpi.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 143ce7e0bee1..668d007f0917 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -974,6 +974,15 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) return -ENODEV; } +static inline int acpi_register_wakeup_handler(int wake_irq, + bool (*wakeup)(void *context), void *context) +{ + return -ENXIO; +} + +static inline void acpi_unregister_wakeup_handler( + bool (*wakeup)(void *context), void *context) { } + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC -- cgit v1.2.3 From a3143f7822a9eeb38f0e046080ae8f79f6c7122d Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:01:58 -0600 Subject: Remove unused header Commit 6a80b30086b8 ("fmc: Delete the FMC subsystem") removed the last user of <linux/sdb.h>, but left the header file behind. Nothing uses this file, delete it now.
Cc: Linus Walleij Cc: Alessandro Rubini Signed-off-by: Jonathan Corbet Acked-by: Alessandro Rubini Link: https://lore.kernel.org/r/20211102220203.940290-5-corbet@lwn.net Signed-off-by: Linus Walleij --- include/linux/sdb.h | 160 ---------------------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 include/linux/sdb.h (limited to 'include/linux') diff --git a/include/linux/sdb.h b/include/linux/sdb.h deleted file mode 100644 index a2404a2bbd10..000000000000 --- a/include/linux/sdb.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the official version 1.1 of sdb.h - */ -#ifndef __SDB_H__ -#define __SDB_H__ -#ifdef __KERNEL__ -#include -#else -#include -#endif - -/* - * All structures are 64 bytes long and are expected - * to live in an array, one for each interconnect. - * Most fields of the structures are shared among the - * various types, and most-specific fields are at the - * beginning (for alignment reasons, and to keep the - * magic number at the head of the interconnect record - */ - -/* Product, 40 bytes at offset 24, 8-byte aligned - * - * device_id is vendor-assigned; version is device-specific, - * date is hex (e.g 0x20120501), name is UTF-8, blank-filled - * and not terminated with a 0 byte. - */ -struct sdb_product { - uint64_t vendor_id; /* 0x18..0x1f */ - uint32_t device_id; /* 0x20..0x23 */ - uint32_t version; /* 0x24..0x27 */ - uint32_t date; /* 0x28..0x2b */ - uint8_t name[19]; /* 0x2c..0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* - * Component, 56 bytes at offset 8, 8-byte aligned - * - * The address range is first to last, inclusive - * (for example 0x100000 - 0x10ffff) - */ -struct sdb_component { - uint64_t addr_first; /* 0x08..0x0f */ - uint64_t addr_last; /* 0x10..0x17 */ - struct sdb_product product; /* 0x18..0x3f */ -}; - -/* Type of the SDB record */ -enum sdb_record_type { - sdb_type_interconnect = 0x00, - sdb_type_device = 0x01, - sdb_type_bridge = 0x02, - sdb_type_integration = 0x80, - sdb_type_repo_url = 0x81, - sdb_type_synthesis = 0x82, - sdb_type_empty = 0xFF, -}; - -/* Type 0: interconnect (first of the array) - * - * sdb_records is the length of the table including this first - * record, version is 1. The bus type is enumerated later. - */ -#define SDB_MAGIC 0x5344422d /* "SDB-" */ -struct sdb_interconnect { - uint32_t sdb_magic; /* 0x00-0x03 */ - uint16_t sdb_records; /* 0x04-0x05 */ - uint8_t sdb_version; /* 0x06 */ - uint8_t sdb_bus_type; /* 0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 1: device - * - * class is 0 for "custom device", other values are - * to be standardized; ABI version is for the driver, - * bus-specific bits are defined by each bus (see below) - */ -struct sdb_device { - uint16_t abi_class; /* 0x00-0x01 */ - uint8_t abi_ver_major; /* 0x02 */ - uint8_t abi_ver_minor; /* 0x03 */ - uint32_t bus_specific; /* 0x04-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 2: bridge - * - * child is the address of the nested SDB table - */ -struct sdb_bridge { - uint64_t sdb_child; /* 0x00-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 0x80: integration - * - * all types with bit 7 set are meta-information, so - * software can ignore the types it doesn't know. 
Here we - * just provide product information for an aggregate device - */ -struct sdb_integration { - uint8_t reserved[24]; /* 0x00-0x17 */ - struct sdb_product product; /* 0x08-0x3f */ -}; - -/* Type 0x81: Top module repository url - * - * again, an informative field that software can ignore - */ -struct sdb_repo_url { - uint8_t repo_url[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0x82: Synthesis tool information - * - * this informative record - */ -struct sdb_synthesis { - uint8_t syn_name[16]; /* 0x00-0x0f */ - uint8_t commit_id[16]; /* 0x10-0x1f */ - uint8_t tool_name[8]; /* 0x20-0x27 */ - uint32_t tool_version; /* 0x28-0x2b */ - uint32_t date; /* 0x2c-0x2f */ - uint8_t user_name[15]; /* 0x30-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0xff: empty - * - * this allows keeping empty slots during development, - * so they can be filled later with minimal efforts and - * no misleading description is ever shipped -- hopefully. - * It can also be used to pad a table to a desired length. - */ -struct sdb_empty { - uint8_t reserved[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* The type of bus, for bus-specific flags */ -enum sdb_bus_type { - sdb_wishbone = 0x00, - sdb_data = 0x01, -}; - -#define SDB_WB_WIDTH_MASK 0x0f -#define SDB_WB_ACCESS8 0x01 -#define SDB_WB_ACCESS16 0x02 -#define SDB_WB_ACCESS32 0x04 -#define SDB_WB_ACCESS64 0x08 -#define SDB_WB_LITTLE_ENDIAN 0x80 - -#define SDB_DATA_READ 0x04 -#define SDB_DATA_WRITE 0x02 -#define SDB_DATA_EXEC 0x01 - -#endif /* __SDB_H__ */ -- cgit v1.2.3 From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Nov 2021 18:48:08 +0000 Subject: bpf: Fix toctou on read-only map's constant scalar tracking Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is checking whether maps are read-only both from BPF program side and user space side, and then, given their content is constant, reading out their data via map->ops->map_direct_value_addr() which is then subsequently used as known scalar value for the register, that is, it is marked as __mark_reg_known() with the read value at verification time. Before a23740ec43ba, the register content was marked as an unknown scalar so the verifier could not make any assumptions about the map content. The current implementation however is prone to a TOCTOU race, meaning, the value read as known scalar for the register is not guaranteed to be exactly the same at a later point when the program is executed, and as such, the prior made assumptions of the verifier with regards to the program will be invalid which can cause issues such as OOB access, etc. While the BPF_F_RDONLY_PROG map flag is always fixed and required to be specified at map creation time, the map->frozen property is initially set to false for the map given the map value needs to be populated, e.g. for global data sections. Once complete, the loader "freezes" the map from user space such that no subsequent updates/deletes are possible anymore. For the rest of the lifetime of the map, this freeze one-time trigger cannot be undone anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_* cmd calls which would update/delete map entries will be rejected with -EPERM since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also means that pending update/delete map entries must still complete before this guarantee is given. 
This corner case is not an issue for loaders since they create and prepare such program private map in successive steps. However, a malicious user is able to trigger this TOCTOU race in two different ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is used to expand the competition interval, so that map_update_elem() can modify the contents of the map after map_freeze() and bpf_prog_load() were executed. This works, because userfaultfd halts the parallel thread which triggered a map_update_elem() at the time where we copy key/value from the user buffer and this already passed the FMODE_CAN_WRITE capability test given at that time the map was not "frozen". Then, the main thread performs the map_freeze() and bpf_prog_load(), and once that had completed successfully, the other thread is woken up to complete the pending map_update_elem() which then changes the map content. For ii) the idea of the batched update is similar, meaning, when there are a large number of updates to be processed, it can increase the competition interval between the two. It is therefore possible in practice to modify the contents of the map after executing map_freeze() and bpf_prog_load(). One way to fix both i) and ii) at the same time is to expand the use of the map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with the rationale to make a writable mmap()'ing of a map mutually exclusive with read-only freezing. The counter indicates writable mmap() mappings and then prevents/fails the freeze operation. Its semantics can be expanded beyond just mmap() by generally indicating ongoing write phases. This would essentially span any parallel regular and batched flavor of update/delete operation and then also have map_freeze() fail with -EBUSY. For the check_mem_access() in the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all last pending writes have completed via bpf_map_write_active() test. Once the map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0 only then we are really guaranteed to use the map's data as known constants. For map->frozen being set and pending writes in process of still being completed we fall back to marking that register as unknown scalar so we don't end up making assumptions about it. With this, both TOCTOU reproducers from i) and ii) are fixed. Note that the map->writecnt has been converted into a atomic64 in the fix in order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning the freeze_mutex over entire map update/delete operations in syscall side would not be possible due to then causing everything to be serialized. Similarly, something like synchronize_rcu() after setting map->frozen to wait for update/deletes to complete is not possible either since it would also have to span the user copy which can sleep. On the libbpf side, this won't break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed mmap()-ed memory where for .rodata it's non-writable. 
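Expressed as code, the verifier-side rule described above amounts to something like the following sketch (close to, but not necessarily identical with, the final patch):

	/* Map contents may only be treated as constant scalars once the
	 * map is frozen and no update/delete writer is still in flight;
	 * otherwise the register falls back to an unknown scalar.
	 */
	static bool bpf_map_is_rdonly(const struct bpf_map *map)
	{
		return (map->map_flags & BPF_F_RDONLY_PROG) &&
		       map->frozen &&
		       !bpf_map_write_active(map);
	}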
Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: w1tcher.bupt@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f715e8863f4d..e7a163a3146b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,7 +193,7 @@ struct bpf_map { atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; - u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + atomic64_t writecnt; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); +bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, -- cgit v1.2.3 From d7751d6476185ff754b9dad2cba0c0a6e43ecadc Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 20 May 2021 17:09:58 +0300 Subject: net/mlx5: E-Switch, Fix resetting of encap mode when entering switchdev E-Switch encap mode is relevant only when in switchdev mode. The RDMA driver can query the encap configuration via mlx5_eswitch_get_encap_mode(). Make sure it returns the currently used mode and not the set one. This reverts the cited commit which reset the encap mode on entering switchdev and fixes the original issue properly. Fixes: 9a64144d683a ("net/mlx5: E-Switch, Fix default encap mode") Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 97afcea39a7b..8b18fe9771f9 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -145,13 +145,13 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \ ESW_TUN_OPTS_OFFSET + 1) -u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw); #else /* CONFIG_MLX5_ESWITCH */ -static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) +static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev) { return MLX5_ESWITCH_NONE; } -- cgit v1.2.3 From 49c39ec4670a8f045729e3717af2e1a74caf89a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 13 Sep 2021 14:21:25 +0200 Subject: dma-buf: nuke dma_resv_get_excl_unlocked MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Heureka, that's finally not used any more. 
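With the unlocked helper gone, the remaining pattern is the locked one; an illustrative sketch only, error handling elided:

	struct dma_fence *fence;

	dma_resv_lock(obj, NULL);
	fence = dma_resv_excl_fence(obj);
	if (fence)
		fence = dma_fence_get(fence);
	dma_resv_unlock(obj);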
Signed-off-by: Christian König Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210917123513.1106-27-christian.koenig@amd.com --- include/linux/dma-resv.h | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-resv.h b/include/linux/dma-resv.h index 09c6063b199a..eebf04325b34 100644 --- a/include/linux/dma-resv.h +++ b/include/linux/dma-resv.h @@ -440,32 +440,6 @@ dma_resv_excl_fence(struct dma_resv *obj) return rcu_dereference_check(obj->fence_excl, dma_resv_held(obj)); } -/** - * dma_resv_get_excl_unlocked - get the reservation object's - * exclusive fence, without lock held. - * @obj: the reservation object - * - * If there is an exclusive fence, this atomically increments it's - * reference count and returns it. - * - * RETURNS - * The exclusive fence or NULL if none - */ -static inline struct dma_fence * -dma_resv_get_excl_unlocked(struct dma_resv *obj) -{ - struct dma_fence *fence; - - if (!rcu_access_pointer(obj->fence_excl)) - return NULL; - - rcu_read_lock(); - fence = dma_fence_get_rcu_safe(&obj->fence_excl); - rcu_read_unlock(); - - return fence; -} - /** * dma_resv_shared_list - get the reservation object's shared fence list * @obj: the reservation object -- cgit v1.2.3 From f45b2974cc0ae959a4c503a071e38a56bd64372f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 17 Nov 2021 13:57:08 +0100 Subject: bpf, x86: Fix "no previous prototype" warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The arch_prepare_bpf_dispatcher function does not have a prototype, and yields the following warning when W=1 is enabled for the kernel build. >> arch/x86/net/bpf_jit_comp.c:2188:5: warning: no previous \ prototype for 'arch_prepare_bpf_dispatcher' [-Wmissing-prototypes] 2188 | int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, \ int num_funcs) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Remove the warning by adding a function declaration to include/linux/bpf.h. Fixes: 75ccbef6369e ("bpf: Introduce BPF dispatcher") Reported-by: kernel test robot Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117125708.769168-1-bjorn@kernel.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7a163a3146b..84ff6ef49462 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -732,6 +732,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); +int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ -- cgit v1.2.3 From cf9acc90c80ecbee00334aa85d92f4e74014bcff Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Tue, 16 Nov 2021 17:42:42 +0000 Subject: net: virtio_net_hdr_to_skb: count transport header in UFO virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type correctly for UFO packets received via virtio-net that are a little over the GSO size. This can lead to problems elsewhere in the networking stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is not set. 
This is due to the comparison if (skb->len - p_off > gso_size) not properly accounting for the transport layer header. p_off includes the size of the transport layer header (thlen), so skb->len - p_off is the size of the TCP/UDP payload. gso_size is read from the virtio-net header. For UFO, fragmentation happens at the IP level so does not need to include the UDP header. Hence the calculation could be comparing a TCP/UDP payload length with an IP payload length, causing legitimate virtio-net packets to lack gso_type/gso_size information. Example: a UDP packet with payload size 1473 has IP payload size 1481. If the guest used UFO, it is not fragmented and the virtio-net header's flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with gso_size = 1480 for an MTU of 1500. skb->len will be 1515 and p_off will be 42, so skb->len - p_off = 1473. Hence the comparison fails, and shinfo->gso_size and gso_type are not set as they should be. Instead, add the UDP header length before comparing to gso_size when using UFO. In this way, it is the size of the IP payload that is compared to gso_size. Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry") Signed-off-by: Jonathan Davies Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index b465f8f3e554..04e87f4b9417 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -120,10 +120,15 @@ retry: if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); + /* UFO may not include transport header in gso_size. */ + if (gso_type & SKB_GSO_UDP) + nh_off -= thlen; + /* Too small packets are not really GSO ones. */ - if (skb->len - p_off > gso_size) { + if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; -- cgit v1.2.3 From 522a0032af005502507f5f81ae64fdcc82b5d068 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 6 Nov 2021 17:13:35 -0400 Subject: Add linux/cacheflush.h Many architectures do not include asm-generic/cacheflush.h, so turn the includes on their head and add linux/cacheflush.h which includes asm/cacheflush.h. Move the flush_dcache_folio() declaration from asm-generic/cacheflush.h to linux/cacheflush.h and change linux/highmem.h to include linux/cacheflush.h instead of asm/cacheflush.h so that all necessary places will see flush_dcache_folio(). More functions should have their default implementations moved in the future, but those are for follow-on patches. This fixes csky, sparc and sparc64 which were missed in the commit which added flush_dcache_folio().
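The practical effect is that generic code can simply include the new header and call flush_dcache_folio() on any architecture; a hypothetical caller, for illustration:

	#include <linux/cacheflush.h>

	static void foo_finish_write(struct folio *folio)
	{
		/* Compiles to a no-op on architectures that do not
		 * implement flush_dcache_page()/flush_dcache_folio().
		 */
		flush_dcache_folio(folio);
	}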
Fixes: 08b0b0059bf1 ("mm: Add flush_dcache_folio()") Suggested-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven --- include/linux/cacheflush.h | 18 ++++++++++++++++++ include/linux/highmem.h | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 include/linux/cacheflush.h (limited to 'include/linux') diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h new file mode 100644 index 000000000000..fef8b607f97e --- /dev/null +++ b/include/linux/cacheflush.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CACHEFLUSH_H +#define _LINUX_CACHEFLUSH_H + +#include + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio); +#endif +#else +static inline void flush_dcache_folio(struct folio *folio) +{ +} +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0 +#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ + +#endif /* _LINUX_CACHEFLUSH_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25aff0f2ed0b..c944b3b70ee7 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -5,12 +5,11 @@ #include #include #include +#include #include #include #include -#include - #include "highmem-internal.h" /** -- cgit v1.2.3 From 9c3252152e8a6401c2b9e32490a5a16ec4472778 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Nov 2021 21:17:14 -0500 Subject: mm: Rename folio_test_multi to folio_test_large This is a better name. Also add kernel-doc. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/page-flags.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 52ec4b5e5615..05510118fbb8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -692,7 +692,13 @@ static inline bool folio_test_single(struct folio *folio) return !folio_test_head(folio); } -static inline bool folio_test_multi(struct folio *folio) +/** + * folio_test_large() - Does this folio contain more than one page? + * @folio: The folio to test. + * + * Return: True if the folio is larger than one page. + */ +static inline bool folio_test_large(struct folio *folio) { return folio_test_head(folio); } -- cgit v1.2.3 From a1efe484dd8c04c4c2d4eb1ee6b04d01cfc07ccc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Nov 2021 21:18:52 -0500 Subject: mm: Remove folio_test_single There's no need for this predicate; callers can just use !folio_test_large(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/page-flags.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 05510118fbb8..b5f14d581113 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -686,12 +686,6 @@ static inline bool test_set_page_writeback(struct page *page) __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) -/* Whether there are one or multiple pages in a folio */ -static inline bool folio_test_single(struct folio *folio) -{ - return !folio_test_head(folio); -} - /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. 
-- cgit v1.2.3 From ff36da69bc90d80b0c73f47f4b2e270b3ff6da99 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 29 Aug 2021 06:07:03 -0400 Subject: fs: Remove FS_THP_SUPPORT Instead of setting a bit in the fs_flags to set a bit in the address_space, set the bit in the address_space directly. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- include/linux/fs.h | 1 - include/linux/pagemap.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1cb616fc1105..bbf812ce89a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2518,7 +2518,6 @@ struct file_system_type { #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ -#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1a0c646eb6ff..9e33878bf23b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -176,6 +176,22 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/** + * mapping_set_large_folios() - Indicate the file supports large folios. + * @mapping: The file. + * + * The filesystem should call this function in its inode constructor to + * indicate that the VFS can use large folios to cache the contents of + * the file. + * + * Context: This should not be called while the inode is active as it + * is non-atomic. + */ +static inline void mapping_set_large_folios(struct address_space *mapping) +{ + __set_bit(AS_THP_SUPPORT, &mapping->flags); +} + static inline bool mapping_thp_support(struct address_space *mapping) { return test_bit(AS_THP_SUPPORT, &mapping->flags); -- cgit v1.2.3 From ed2145c474c9015bc634e35f6d1a9b7767f3fbfc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 29 Aug 2021 06:28:19 -0400 Subject: fs: Rename AS_THP_SUPPORT and mapping_thp_support These are now indicators of large folio support, not THP support. 
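A filesystem opts in from its inode constructor, as the kerneldoc above describes; a hypothetical "foo" filesystem (foo_aops is an assumed address_space_operations) might do:

	static void foo_init_inode(struct inode *inode)
	{
		inode->i_mapping->a_ops = &foo_aops;
		mapping_set_large_folios(inode->i_mapping);
	}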
Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagemap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 9e33878bf23b..605246452305 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -84,7 +84,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, - AS_THP_SUPPORT = 6, /* THPs supported */ + AS_LARGE_FOLIO_SUPPORT = 6, }; /** @@ -189,12 +189,12 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) */ static inline void mapping_set_large_folios(struct address_space *mapping) { - __set_bit(AS_THP_SUPPORT, &mapping->flags); + __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } -static inline bool mapping_thp_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(struct address_space *mapping) { - return test_bit(AS_THP_SUPPORT, &mapping->flags); + return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } static inline int filemap_nr_thps(struct address_space *mapping) @@ -209,7 +209,7 @@ static inline int filemap_nr_thps(struct address_space *mapping) static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS - if (!mapping_thp_support(mapping)) + if (!mapping_large_folio_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(1); @@ -219,7 +219,7 @@ static inline void filemap_nr_thps_inc(struct address_space *mapping) static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS - if (!mapping_thp_support(mapping)) + if (!mapping_large_folio_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(1); -- cgit v1.2.3 From 357a18ad230f0867791b788d2b1d6f280f6f6e61 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:27 +0000 Subject: KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status") I removed the only user of these functions because it was basically impossible to use them safely. There are two stages to the GFN->PFN mapping; first through the KVM memslots to a userspace HVA and then through the page tables to translate that HVA to an underlying PFN. Invalidations of the former were being handled correctly, but no attempt was made to use the MMU notifiers to invalidate the cache when the HVA->GFN mapping changed. As a prelude to reinventing the gfn_to_pfn_cache with more usable semantics, rip it out entirely and untangle the implementation of the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it. All current users of kvm_vcpu_map() also look broken right now, and will be dealt with separately. They broadly fall into two classes: * Those which map, access the data and immediately unmap. This is mostly gratuitous and could just as well use the existing user HVA, and could probably benefit from a gfn_to_hva_cache as they do so. * Those which keep the mapping around for a longer time, perhaps even using the PFN directly from the guest. These will need to be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map() can be removed too. 
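For the first class of users, the short-lived pattern looks roughly like this sketch (error handling trimmed; 'gpa' is an assumed guest physical address):

	struct kvm_host_map map;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		return -EFAULT;
	/* ... read or write the guest page via map.hva ... */
	kvm_vcpu_unmap(vcpu, &map, true);	/* true: mark the page dirty */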
Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-8-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +----- include/linux/kvm_types.h | 7 ------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 60a35d9fe259..eb625af4fc5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -866,7 +866,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); @@ -942,12 +942,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool atomic); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2237abb93ccd..234eab059839 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,13 +53,6 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; -struct gfn_to_pfn_cache { - u64 generation; - gfn_t gfn; - kvm_pfn_t pfn; - bool dirty; -}; - #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE /* * Memory caches are used to preallocate memory ahead of various MMU flows, -- cgit v1.2.3 From f915b75bffb7257bd8d26376b8e1cc67771927f8 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 17 Nov 2021 15:56:52 +0800 Subject: page_pool: Revert "page_pool: disable dma mapping support..." This reverts commit d00e60ee54b12de945b8493cf18c1ada9e422514. As reported by Guillaume in [1]: Enabling LPAE always enables CONFIG_ARCH_DMA_ADDR_T_64BIT in 32-bit systems, which breaks the bootup process when an ethernet driver is using page pool with the PP_FLAG_DMA_MAP flag. As we had hoped there were no active consumers on such systems when we removed the DMA mapping support, and LPAE seems to be a common feature for 32-bit systems, revert it. 1. https://www.spinics.net/lists/netdev/msg779890.html Fixes: d00e60ee54b1 ("page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA") Signed-off-by: Yunsheng Lin Reported-by: "kernelci.org bot" Tested-by: "kernelci.org bot" Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S.
Miller --- include/linux/mm_types.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb8c6f5f19bc..c3a6e6209600 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -105,7 +105,18 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA. + */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { -- cgit v1.2.3 From 6966df483d7b5b218aeb0e13e7e334a8fc3c1744 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 18 Nov 2021 13:49:51 +0200 Subject: regulator: Update protection IRQ helper docs The documentation of the IRQ notification helper still had references to the first RFC implementation, which called BUG() while trying to protect the hardware. The behaviour was improved, as calling BUG() was not a proper solution. The current implementation attempts to call poweroff if handling of a potentially damaging error notification fails. Update the documentation to reflect the actual behaviour. Signed-off-by: Matti Vaittinen Link: https://lore.kernel.org/r/0c9cc4bcf20c3da66fd5a85c97ee4288e5727538.1637233864.git.matti.vaittinen@fi.rohmeurope.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index bd7a73db2e66..54cf566616ae 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -499,7 +499,8 @@ struct regulator_irq_data { * best to shut-down regulator(s) or reboot the SOC if error * handling is repeatedly failing. If fatal_cnt is given the IRQ * handling is aborted if it fails for fatal_cnt times and die() - * callback (if populated) or BUG() is called to try to prevent + * callback (if populated) is called. If die() is not populated + * poweroff for the system is attempted in order to prevent any * further damage. * @reread_ms: The time which is waited before attempting to re-read status * at the worker if IC reading fails. Immediate re-read is done @@ -516,11 +517,12 @@ * @data: Driver private data pointer which will be passed as such to * the renable, map_event and die callbacks in regulator_irq_data. * @die: Protection callback. If IC status reading or recovery actions - * fail fatal_cnt times this callback or BUG() is called. This - * callback should implement a final protection attempt like - * disabling the regulator. If protection succeeded this may - * return 0. If anything else is returned the core assumes final - * protection failed and calls BUG() as a last resort. + * fail fatal_cnt times this callback is called or system is + * powered off. This callback should implement a final protection + * attempt like disabling the regulator. If protection succeeded + * die() may return 0. If anything else is returned the core + * assumes final protection failed and attempts to perform a + * poweroff as a last resort. * @map_event: Driver callback to map IRQ status into regulator devices with * events / errors.
NOTE: callback MUST initialize both the * errors and notifs for all rdevs which it signals having -- cgit v1.2.3 From c035713998700e8843c7d087f55bce3c54c0e3ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 10:19:05 -0400 Subject: mm: Add functions to zero portions of a folio These functions are wrappers around zero_user_segments(), which means that zero_user_segments() can now be called for compound pages even when CONFIG_TRANSPARENT_HUGEPAGE is disabled. Use 'xend' as the name of the parameter to indicate that this is an excluded end, not the more usual included end. Excluding the end makes more sense to the callers, but can cause confusion to readers who are more used to seeing included ends. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- include/linux/highmem.h | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index c944b3b70ee7..39bb9b47fa9c 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -230,10 +230,10 @@ static inline void tag_clear_highpage(struct page *page) * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); -#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ +#else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) @@ -253,7 +253,7 @@ static inline void zero_user_segments(struct page *page, for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } -#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */ +#endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) @@ -363,4 +363,42 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * folio_zero_segments() - Zero two byte ranges in a folio. + * @folio: The folio to write to. + * @start1: The first byte to zero. + * @xend1: One more than the last byte in the first range. + * @start2: The first byte to zero in the second range. + * @xend2: One more than the last byte in the second range. + */ +static inline void folio_zero_segments(struct folio *folio, + size_t start1, size_t xend1, size_t start2, size_t xend2) +{ + zero_user_segments(&folio->page, start1, xend1, start2, xend2); +} + +/** + * folio_zero_segment() - Zero a byte range in a folio. + * @folio: The folio to write to. + * @start: The first byte to zero. + * @xend: One more than the last byte to zero. + */ +static inline void folio_zero_segment(struct folio *folio, + size_t start, size_t xend) +{ + zero_user_segments(&folio->page, start, xend, 0, 0); +} + +/** + * folio_zero_range() - Zero a byte range in a folio. + * @folio: The folio to write to. + * @start: The first byte to zero. + * @length: The number of bytes to zero. + */ +static inline void folio_zero_range(struct folio *folio, + size_t start, size_t length) +{ + zero_user_segments(&folio->page, start, start + length, 0, 0); +} + #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 18 Nov 2021 14:23:21 -0600 Subject: signal: Replace force_fatal_sig with force_exit_sig when in doubt Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Add force_exit_sig and use it instead of force_fatal_sig where historically the code has directly called do_exit. This has the implementation benefits of going through the signal exit path (including generating core dumps) without the danger of allowing userspace to ignore or change these signals. This avoids userspace regressions as older kernels exited with do_exit which debuggers also can not intercept. In the future is should be possible to improve the quality of implementation of the kernel by changing some of these force_exit_sig calls to force_fatal_sig. That can be done where it matters on a case-by-case basis with careful analysis. Reported-by: Kyle Huey Reported-by: kernel test robot [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die") Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV") Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler") Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig") Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails") Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit") Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.") Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure") Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..33a50642cf41 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); -- cgit v1.2.3 From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:21 -0800 Subject: shm: extend forced shm destroy to support objects from several IPC nses Currently, the exit_shm() function not designed to work properly when task->sysvshm.shm_clist holds shm objects from different IPC namespaces. This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it leads to use-after-free (reproducer exists). 
This is an attempt to fix the problem by extending the exit_shm mechanism to handle destruction of shm objects from several IPC namespaces. To achieve that we do several things: 1. Add a namespace (non-refcounted) pointer to struct shmid_kernel. 2. During new shm object creation (newseg()/shmget syscall), initialize this pointer to the current task's IPC namespace. 3. Rework exit_shm() so that it traverses all shp's in task->sysvshm.shm_clist and gets the IPC namespace not from the current task as before, but from the shp object itself, then calls shm_destroy(shp, ns). Note: We need to be really careful here, because as was said before (1), our pointer to the IPC namespace is non-refcounted. To be on the safe side, we use the special helper get_ipc_ns_not_zero(), which takes a reference on the IPC namespace only if it is not already in the state of destruction. Q/A Q: Why can we access shp->ns memory using a non-refcounted pointer? A: Because the shp object lifetime is always shorter than the IPC namespace lifetime, so if we get the shp object from task->sysvshm.shm_clist while holding task_lock(task), nobody can steal our namespace. Q: Does this patch change the semantics of the unshare/setns/clone syscalls? A: No. It just fixes a previously uncovered case in which a process may leave an IPC namespace without getting its task->sysvshm.shm_clist list cleaned up. Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 15 +++++++++++++++ include/linux/sched/task.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 05e22770af51..b75395ec8d52 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + if (ns) { + if (refcount_inc_not_zero(&ns->ns.count)) + return ns; + } + + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -147,6 +157,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ba88a6987400..058d7f371e25 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -158,7 +158,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). 
* It must not be nested with write_lock_irq(&tasklist_lock), -- cgit v1.2.3 From afe041c2d0febd83698b8b0164e6b3b1dfae0b66 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 19 Nov 2021 16:43:40 -0800 Subject: hugetlb: fix hugetlb cgroup refcounting during mremap When hugetlb_vm_op_open() is called during copy_vma(), we may take a reference to resv_map->css. Later, when clearing the reservation pointer of old_vma after transferring it to new_vma, we forget to drop the reference to resv_map->css. This leads to a reference leak of css. Fix this by adding a check to drop the reservation css reference in clear_vma_resv_huge_pages(). Link: https://lkml.kernel.org/r/20211113154412.91134-1-minhquangbui99@gmail.com Fixes: 550a7d60bd5e35 ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Bui Quang Minh Reviewed-by: Mike Kravetz Reviewed-by: Mina Almasry Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c137396129db..ba025ae27882 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -128,6 +128,13 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( css_get(resv_map->css); } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_put(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -211,6 +218,11 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( { } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { -- cgit v1.2.3 From 985e9ece1e55a94da842f6c1f9ff84d587b26267 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 17 Nov 2021 20:07:35 +0200 Subject: ACPI: Make acpi_node_get_parent() local acpi_node_get_parent() isn't used outside drivers/acpi/property.c. Make it local. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 668d007f0917..b28f8790192a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1182,7 +1182,6 @@ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); -struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, @@ -1287,12 +1286,6 @@ acpi_get_next_subnode(const struct fwnode_handle *fwnode, return NULL; } -static inline struct fwnode_handle * -acpi_node_get_parent(const struct fwnode_handle *fwnode) -{ - return NULL; -} - static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) -- cgit v1.2.3 From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Wed, 24 Nov 2021 18:44:17 -0500 Subject: Revert "virtio_ring: validate used buffer length" This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf. Attempts to validate length in the core did not work out: there turn out to exist multiple broken devices, and in particular legacy devices are known to be broken in this respect. We have ideas for handling this better in the next version but for now let's revert to a known good state to make sure drivers work for people. Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 44d0e09da2d9..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. - * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. @@ -169,7 +168,6 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; - bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); -- cgit v1.2.3 From ec15baec3272bbec576f2ce7ce47765a8e9b7b1c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 26 Nov 2021 19:28:43 +0200 Subject: net: ptp: add a definition for the UDP port for IEEE 1588 general messages As opposed to event messages (Sync, PdelayReq etc) which require timestamping, general messages (Announce, FollowUp etc) do not. In PTP they are part of different streams of data. IEEE 1588-2008 Annex D.2 "UDP port numbers" states that the UDP destination port assigned by IANA is 319 for event messages, and 320 for general messages. Yet the kernel seems to be missing the definition for general messages. This patch adds it. Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: Jakub Kicinski --- include/linux/ptp_classify.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index ae04968a3a47..9afd34a2d36c 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -37,6 +37,7 @@ #define PTP_MSGTYPE_PDELAY_RESP 0x3 #define PTP_EV_PORT 319 +#define PTP_GEN_PORT 320 #define PTP_GEN_BIT 0x08 /* indicates general message, if set in message type */ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ -- cgit v1.2.3 From 6a2d2ddf2c345e0149bfbffdddc4768a9ab0a741 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Fri, 12 Nov 2021 14:32:27 +0100 Subject: drm: Move nomodeset kernel parameter to the DRM subsystem The "nomodeset" kernel cmdline parameter is handled by the vgacon driver but the exported vgacon_text_force() symbol is only used by DRM drivers. It makes much more sense for the parameter logic to be in the subsystem of the drivers that are making use of it. Let's move the vgacon_text_force() function and related logic to the DRM subsystem. 
While doing that, rename it to drm_firmware_drivers_only() and make it return true if "nomodeset" was used and false otherwise. This is a better description of the condition that the drivers are testing for. Suggested-by: Daniel Vetter Signed-off-by: Javier Martinez Canillas Acked-by: Thomas Zimmermann Acked-by: Jani Nikula Acked-by: Pekka Paalanen Acked-by: Greg Kroah-Hartman Link: https://patchwork.freedesktop.org/patch/msgid/20211112133230.1595307-4-javierm@redhat.com --- include/linux/console.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index a97f277cfdfa..7cd758a4f44e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -219,12 +219,6 @@ extern atomic_t ignore_console_lock_warning; #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 -#ifdef CONFIG_VGA_CONSOLE -extern bool vgacon_text_force(void); -#else -static inline bool vgacon_text_force(void) { return false; } -#endif - extern void console_init(void); /* For deferred console takeover */ -- cgit v1.2.3 From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Nov 2021 10:39:29 -0500 Subject: siphash: use _unaligned version by default On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS because the ordinary load/store instructions (ldr, ldrh, ldrb) can tolerate any misalignment of the memory address. However, load/store double and load/store multiple instructions (ldrd, ldm) may still only be used on memory addresses that are 32-bit aligned, and so we have to use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we may end up with a severe performance hit due to alignment traps that require fixups by the kernel. Testing shows that this currently happens with clang-13 but not gcc-11. In theory, any compiler version can produce this bug or other problems, as we are dealing with undefined behavior in C99 even on architectures that support this in hardware, see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363. Fortunately, the get_unaligned() accessors do the right thing: when building for ARMv6 or later, the compiler will emit unaligned accesses using the ordinary load/store instructions (but avoid the ones that require 32-bit alignment). When building for older ARM, those accessors will emit the appropriate sequence of ldrb/mov/orr instructions. And on architectures that can truly tolerate any kind of misalignment, the get_unaligned() accessors resolve to the leXX_to_cpup accessors that operate on aligned addresses. Since the compiler will in fact emit ldrd or ldm instructions when building this code for ARM v6 or later, the solution is to use the unaligned accessors unconditionally on architectures where this is known to be fast. The _aligned version of the hash function is however still needed to get the best performance on architectures that cannot do any unaligned access in hardware. This new version avoids the undefined behavior and should produce the fastest hash on all architectures we support. Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/ Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/ Reported-by: Ard Biesheuvel Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF") Signed-off-by: Arnd Bergmann Reviewed-by: Jason A. Donenfeld Acked-by: Ard Biesheuvel Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..0cda61855d90 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key) } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); -#endif u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); @@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len, static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); -#endif return ___siphash_aligned(data, len, key); } @@ -96,10 +93,8 @@ typedef struct { u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); -#endif u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); @@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); -#endif return ___hsiphash_aligned(data, len, key); } -- cgit v1.2.3 From 79364031c5b4365ca28ac0fa00acfab5bf465be1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 27 Nov 2021 17:32:00 +0100 Subject: bpf: Make sure bpf_disable_instrumentation() is safe vs preemption. The initial implementation of migrate_disable() for mainline was a wrapper around preempt_disable(). RT kernels substituted this with a real migrate disable implementation. Later on mainline gained true migrate disable support, but neither documentation nor affected code were updated. Remove stale comments claiming that migrate_disable() is PREEMPT_RT only. Don't use __this_cpu_inc() in the !PREEMPT_RT path because preemption is not disabled and the RMW operation can be preempted. 
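For illustration only, here is a rough sketch of why the raw per-CPU increment needs preemption protection; the expansion below is schematic, not the actual arch-specific implementation of __this_cpu_inc():

	/* What a plain, non-atomic per-CPU read-modify-write boils down to: */
	int v;

	v = __this_cpu_read(bpf_prog_active);		/* load  */
	/*
	 * With migrate_disable() alone, preemption is still enabled here:
	 * another task may run on this CPU and perform its own increment,
	 * which the store below would then overwrite (a lost update).
	 */
	__this_cpu_write(bpf_prog_active, v + 1);	/* store */

this_cpu_inc(), by contrast, performs the read-modify-write safely against preemption, so it is correct in both the RT and !RT cases, which is what allows the conditional to be dropped.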
Fixes: 74d862b682f51 ("sched: Make migrate_disable/enable() independent of RT") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211127163200.10466-3-bigeasy@linutronix.de --- include/linux/bpf.h | 16 ++-------------- include/linux/filter.h | 3 --- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 84ff6ef49462..755f38e893be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1353,28 +1353,16 @@ extern struct mutex bpf_stats_enabled_mutex; * kprobes, tracepoints) to prevent deadlocks on map operations as any of * these events can happen inside a region which holds a map bucket lock * and can deadlock on it. - * - * Use the preemption safe inc/dec variants on RT because migrate disable - * is preemptible on RT and preemption in the middle of the RMW operation - * might lead to inconsistent state. Use the raw variants for non RT - * kernels as migrate_disable() maps to preempt_disable() so the slightly - * more expensive save operation can be avoided. */ static inline void bpf_disable_instrumentation(void) { migrate_disable(); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_inc(bpf_prog_active); - else - __this_cpu_inc(bpf_prog_active); + this_cpu_inc(bpf_prog_active); } static inline void bpf_enable_instrumentation(void) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - this_cpu_dec(bpf_prog_active); - else - __this_cpu_dec(bpf_prog_active); + this_cpu_dec(bpf_prog_active); migrate_enable(); } diff --git a/include/linux/filter.h b/include/linux/filter.h index 24b7ed2677af..534f678ca50f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -640,9 +640,6 @@ static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. - * - * For non RT enabled kernels migrate_disable/enable() maps to - * preempt_disable/enable(), i.e. it disables also preemption. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) -- cgit v1.2.3 From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Nov 2021 17:21:45 +0200 Subject: net/mlx5: Fix access to a non-supported register Validate MRTC register is supported before triggering a delayed work which accesses it. Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..fbaab440a484 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; - u8 regs_63_to_32[0x20]; + u8 regs_63_to_46[0x12]; + u8 mrtc[0x1]; + u8 regs_44_to_32[0xd]; + u8 regs_31_to_0[0x20]; }; -- cgit v1.2.3 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. 
But if the user sets a big enough number (e.g. (size_t)-8), the sum 'data_size + sizeof(struct kretprobe_instance)' overflows and becomes smaller than sizeof(struct kretprobe_instance), or even zero. As a result, the kretprobe_instance is allocated without enough memory, and the kretprobe accesses memory outside of the allocation. To avoid this issue, introduce a maximum limit for kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; -- cgit v1.2.3 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there. BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] 
macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int 
debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } -- cgit v1.2.3 From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:55 +0200 Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time. task_cputime_adjusted() snapshots utime and stime and then adjust their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime. To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations. 
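As a hedged sketch of how a reader is expected to consume the new return value (the helper names below are hypothetical, not part of this patch):

	u64 utime, stime;

	/*
	 * task_cputime() now returns true if @t was seen actually running
	 * with vtime accounting, i.e. sum_exec_runtime may lag behind the
	 * utime/stime snapshot and must be refreshed before adjusting.
	 */
	if (task_cputime(t, &utime, &stime))
		refresh_sum_exec_runtime(t);		/* hypothetical helper */

	adjust_to_sum_exec_runtime(t, utime, stime);	/* hypothetical helper */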
Reported-by: Hasegawa Hitomi Signed-off-by: Frederic Weisbecker Signed-off-by: Hasegawa Hitomi Signed-off-by: Thomas Gleixner Tested-by: Masayoshi Mizuma Acked-by: Phil Auld Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org --- include/linux/sched/cputime.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..ce3c58286062 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) -- cgit v1.2.3 From f83baa0cb6cfc92ebaf7f9d3a99d7e34f2e77a8a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Dec 2021 19:35:01 +0100 Subject: HID: add hid_is_usb() function to make USB detection simpler A number of HID drivers already call hid_is_using_ll_driver(), but only to detect whether the device is a USB device or not. Make this more obvious by creating hid_is_usb() and calling the function that way. Also convert the existing hid_is_using_ll_driver() callers to use the new call. Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: linux-input@vger.kernel.org Cc: stable@vger.kernel.org Tested-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20211201183503.2373082-1-gregkh@linuxfoundation.org --- include/linux/hid.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 9e067f937dbc..f453be385bd4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -840,6 +840,11 @@ static inline bool hid_is_using_ll_driver(struct hid_device *hdev, return hdev->ll_driver == driver; } +static inline bool hid_is_usb(struct hid_device *hdev) +{ + return hid_is_using_ll_driver(hdev, &usb_hid_driver); +} + #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 -- cgit v1.2.3 From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:40 +0530 Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL Vinicius Costa Gomes reported [0] that the build fails when CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled. This leads to btf.c not being compiled, and then no symbol being present in vmlinux for the declarations in btf.h. Since BTF is not useful without enabling the BPF subsystem, disallow this combination. However, theoretically, disabling both could still fail the build, as the symbols for the kfunc_btf_id_list variables are not available. This isn't a problem as the compiler usually optimizes away the whole register/unregister call, but at lower optimization levels it can fail the build in the linking stage. Fix that by adding dummy variables so that modules taking the address of them still work, but the whole thing is a noop. 
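For context, a module-side registration under CONFIG_DEBUG_INFO_BTF_MODULES looks roughly like the sketch below, based on the DEFINE_KFUNC_BTF_ID_SET macro in the diff that follows; the BTF set contents and module names are hypothetical:

	/* requires <linux/btf.h> and <linux/btf_ids.h> */
	BTF_SET_START(test_kfunc_ids)
	BTF_ID(func, my_test_kfunc)	/* hypothetical kfunc */
	BTF_SET_END(test_kfunc_ids)

	DEFINE_KFUNC_BTF_ID_SET(&test_kfunc_ids, test_kfunc_btf_set);

	static int __init testmod_init(void)
	{
		register_kfunc_btf_id_set(&prog_test_kfunc_list, &test_kfunc_btf_set);
		return 0;
	}

	static void __exit testmod_exit(void)
	{
		unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &test_kfunc_btf_set);
	}

With the config disabled, register/unregister become empty inline stubs, and the __maybe_unused dummy list variables keep the &prog_test_kfunc_list references linkable at any optimization level.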
[0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Reported-by: Vinicius Costa Gomes Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com --- include/linux/btf.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..0e1b6281fd8f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -245,7 +245,10 @@ struct kfunc_btf_id_set { struct module *owner; }; -struct kfunc_btf_id_list; +struct kfunc_btf_id_list { + struct list_head list; + struct mutex mutex; +}; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s); bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, struct module *owner); + +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #else static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s) @@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, { return false; } + +static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; +static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; #endif #define DEFINE_KFUNC_BTF_ID_SET(set, name) \ struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; - #endif -- cgit v1.2.3 From 8581fd402a0cf80b5298e3b225e7a7bd8f110e69 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 2 Dec 2021 12:34:00 -0800 Subject: treewide: Add missing includes masked by cgroup -> bpf dependency cgroup.h (therefore swap.h, therefore half of the universe) includes bpf.h which in turn includes module.h and slab.h. Since we're about to get rid of that dependency we need to clean things up. v2: drop the cpu.h include from cacheinfo.h, it's not necessary and it makes riscv sensitive to ordering of include files. 
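The shape of the treewide fix is simple; an illustrative example, not a hunk from this patch: any header or source file that previously compiled only because bpf.h dragged in module.h or slab.h transitively now spells out its own dependency, e.g.:

	/*
	 * Before: kzalloc() resolved only because some include chain
	 * eventually pulled in <linux/slab.h>.
	 * After: the dependency is stated where it is used.
	 */
	#include <linux/slab.h>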
Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Reviewed-by: Christoph Hellwig Acked-by: Krzysztof Wilczyński Acked-by: Peter Chen Acked-by: SeongJae Park Acked-by: Jani Nikula Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20211120035253.72074-1-kuba@kernel.org/ # v1 Link: https://lore.kernel.org/all/20211120165528.197359-1-kuba@kernel.org/ # cacheinfo discussion Link: https://lore.kernel.org/bpf/20211202203400.1208663-1-kuba@kernel.org --- include/linux/cacheinfo.h | 1 - include/linux/device/driver.h | 1 + include/linux/filter.h | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2f909ed084c6..4ff37cb763ae 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -3,7 +3,6 @@ #define _LINUX_CACHEINFO_H #include -#include #include #include diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index a498ebcf4993..15e7c5e15d62 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -18,6 +18,7 @@ #include #include #include +#include /** * enum probe_type - device driver probe type to try diff --git a/include/linux/filter.h b/include/linux/filter.h index 534f678ca50f..7f1e88e3e2b5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -6,6 +6,7 @@ #define __LINUX_FILTER_H__ #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include -#include struct sk_buff; struct sock; -- cgit v1.2.3 From a97770cc4016c2733bcef9dbe3d5b1ad02d13356 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Mon, 6 Dec 2021 16:12:27 +0800 Subject: net: phy: Remove unnecessary indentation in the comments of phy_device Fix warnings such as: linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:543: WARNING: Unexpected indentation. linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:544: WARNING: Block quote ends without a blank line; unexpected unindent. linux-next/Documentation/networking/kapi:122: ./include/linux/phy.h:546: WARNING: Unexpected indentation. Suggested-by: Akira Yokosawa Signed-off-by: Yanteng Si Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 96e43fbb2dd8..cbf03a5f9cf5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -538,11 +538,12 @@ struct macsec_ops; * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. - * Bits [15:0] are free to use by the PHY driver to communicate - * driver specific behavior. - * Bits [23:16] are currently reserved for future use. - * Bits [31:24] are reserved for defining generic - * PHY driver behavior. + * + * - Bits [15:0] are free to use by the PHY driver to communicate + * driver specific behavior. + * - Bits [23:16] are currently reserved for future use. + * - Bits [31:24] are reserved for defining generic + * PHY driver behavior. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY -- cgit v1.2.3 From 444dd878e85fb33fcfb2682cfdab4c236f33ea3e Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 3 Dec 2021 17:19:47 +0100 Subject: PM: runtime: Fix pm_runtime_active() kerneldoc comment The kerneldoc comment of pm_runtime_active() does not reflect the behavior of the function, so update it accordingly. Fixes: 403d2d116ec0 ("PM: runtime: Add kerneldoc comments to multiple helpers") Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson --- include/linux/pm_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 222da43b7096..eddd66d426ca 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -129,7 +129,7 @@ static inline bool pm_runtime_suspended(struct device *dev) * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * - * Return %true if runtime PM is enabled for @dev and its runtime PM status is + * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is -- cgit v1.2.3 From cab2d3fd6866e089b5c50db09dece131f85bfebd Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Thu, 9 Dec 2021 18:46:33 +0530 Subject: bus: mhi: core: Add support for forced PM resume For whatever reason, some devices like QCA6390 and WCN6855 using ath11k are not in M3 state during PM resume, but are still functional. mhi_pm_resume() should then not fail in those cases and should let the higher-level, device-specific stack continue the resume process. Add an API, mhi_pm_resume_force(), to force resuming irrespective of the current MHI state. This fixes a regression with non-functional ath11k WiFi after a suspend/resume cycle on some machines. Bug report: https://bugzilla.kernel.org/show_bug.cgi?id=214179 Link: https://lore.kernel.org/regressions/871r5p0x2u.fsf@codeaurora.org/ Fixes: 020d3b26c07a ("bus: mhi: Early MHI resume failure in non M3 state") Cc: stable@vger.kernel.org #5.13 Reported-by: Kalle Valo Reported-by: Pengyu Ma Tested-by: Kalle Valo Acked-by: Kalle Valo Signed-off-by: Loic Poulain [mani: Switched to API, added bug report, reported-by tags and CCed stable] Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20211209131633.4168-1-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 723985879035..a5cc4cdf9cc8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -663,6 +663,19 @@ int mhi_pm_suspend(struct mhi_controller *mhi_cntrl); */ int mhi_pm_resume(struct mhi_controller *mhi_cntrl); +/** + * mhi_pm_resume_force - Force resume MHI from suspended state + * @mhi_cntrl: MHI controller + * + * Resume the device irrespective of its MHI state. As per the MHI spec, devices + * have to be in M3 state during resume. But some devices seem to be in a + * different MHI state other than M3 but they continue working fine if allowed. + * This API is intended to be used for such devices. + * + * Return: 0 if the resume succeeds, a negative error code otherwise + */ +int mhi_pm_resume_force(struct mhi_controller *mhi_cntrl); + /** * mhi_download_rddm_image - Download ramdump image from device for * debugging purpose. 
-- cgit v1.2.3 From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:51 -0800 Subject: wait: add wake_up_pollfree() Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'. However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone. Considering the three non-blocking poll systems: - io_uring poll doesn't handle POLLFREE at all, so it is broken anyway. - aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later. - epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile. Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up. Reported-by: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2d0df57c9902..851e07da2583 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); +void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) +/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. 
+ * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ -- cgit v1.2.3 From a4f1192cb53758a7210ed5a9ee695aeba22f75fb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 9 Dec 2021 14:30:33 +0200 Subject: percpu_ref: Replace kernel.h with the necessary inclusions When kernel.h is used in headers it adds a lot to dependency hell, especially when circular dependencies are involved. Replace the kernel.h inclusion with the list of what is really being used. Signed-off-by: Andy Shevchenko Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b31d3f3312ce..d73a1c08c3e3 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -51,9 +51,9 @@ #define _LINUX_PERCPU_REFCOUNT_H #include -#include #include #include +#include #include struct percpu_ref; -- cgit v1.2.3 From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it use the newly introduced function. This patch (of 2): Some kernel threads such as DAMON may need to repeatedly sleep at the microsecond level. However, because usleep_range() sleeps in an uninterruptible state, such threads make /proc/loadavg report fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is the same as usleep_range() but sets the state of the current task to TASK_IDLE while sleeping. 
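A sketch of the intended usage in a sampling kthread such as DAMON's (the worker function below is hypothetical; DAMON's real loop lives in mm/damon/):

	/* requires <linux/kthread.h> and <linux/delay.h> */
	static int sampling_kthread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			do_one_sample(data);	/* hypothetical work */
			/*
			 * Sleep in TASK_IDLE so that waiting here does not
			 * count toward /proc/loadavg, unlike usleep_range()
			 * which sleeps in TASK_UNINTERRUPTIBLE.
			 */
			usleep_idle_range(100, 200);
		}
		return 0;
	}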
Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delay.h b/include/linux/delay.h index 8eacf67eb212..039e7e0c7378 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { -- cgit v1.2.3