From 63c15822b8dd02a2423cfd92232245ace3f7a11b Mon Sep 17 00:00:00 2001 From: Syed Nayyar Waris Date: Wed, 27 Mar 2024 16:23:38 +0100 Subject: lib/bitmap: add bitmap_{read,write}() The two new functions allow reading/writing values of length up to BITS_PER_LONG bits at arbitrary position in the bitmap. The code was taken from "bitops: Introduce the for_each_set_clump macro" by Syed Nayyar Waris with a number of changes and simplifications: - instead of using roundup(), which adds an unnecessary dependency on , we calculate space as BITS_PER_LONG-offset; - indentation is reduced by not using else-clauses (suggested by checkpatch for bitmap_get_value()); - bitmap_get_value()/bitmap_set_value() are renamed to bitmap_read() and bitmap_write(); - some redundant computations are omitted. Cc: Arnd Bergmann Signed-off-by: Syed Nayyar Waris Signed-off-by: William Breathitt Gray Link: https://lore.kernel.org/lkml/fe12eedf3666f4af5138de0e70b67a07c7f40338.1592224129.git.syednwaris@gmail.com/ Suggested-by: Yury Norov Co-developed-by: Alexander Potapenko Signed-off-by: Alexander Potapenko Reviewed-by: Andy Shevchenko Acked-by: Yury Norov Signed-off-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitmap.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index aa4096126553..914c23e96f26 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -83,6 +83,10 @@ struct device; * bitmap_to_arr64(buf, src, nbits) Copy nbits from buf to u64[] dst * bitmap_get_value8(map, start) Get 8bit value from map at start * bitmap_set_value8(map, value, start) Set 8bit value to map at start + * bitmap_read(map, start, nbits) Read an nbits-sized value from + * map at start + * bitmap_write(map, value, start, nbits) Write an nbits-sized value to + * map at start * * Note, bitmap_zero() and bitmap_fill() operate over the region of * unsigned longs, that is, bits behind bitmap till the unsigned long @@ -754,6 +758,79 @@ static inline void bitmap_set_value8(unsigned long *map, unsigned long value, map[index] |= value << offset; } +/** + * bitmap_read - read a value of n-bits from the memory region + * @map: address to the bitmap memory region + * @start: bit offset of the n-bit value + * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG + * + * Returns: value of @nbits bits located at the @start bit offset within the + * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return + * value is undefined. + */ +static inline unsigned long bitmap_read(const unsigned long *map, + unsigned long start, + unsigned long nbits) +{ + size_t index = BIT_WORD(start); + unsigned long offset = start % BITS_PER_LONG; + unsigned long space = BITS_PER_LONG - offset; + unsigned long value_low, value_high; + + if (unlikely(!nbits || nbits > BITS_PER_LONG)) + return 0; + + if (space >= nbits) + return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits); + + value_low = map[index] & BITMAP_FIRST_WORD_MASK(start); + value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits); + return (value_low >> offset) | (value_high << space); +} + +/** + * bitmap_write - write n-bit value within a memory region + * @map: address to the bitmap memory region + * @value: value to write, clamped to nbits + * @start: bit offset of the n-bit value + * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG. + * + * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(), + * i.e. bits beyond @nbits are ignored: + * + * for (bit = 0; bit < nbits; bit++) + * __assign_bit(start + bit, bitmap, val & BIT(bit)); + * + * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed. + */ +static inline void bitmap_write(unsigned long *map, unsigned long value, + unsigned long start, unsigned long nbits) +{ + size_t index; + unsigned long offset; + unsigned long space; + unsigned long mask; + bool fit; + + if (unlikely(!nbits || nbits > BITS_PER_LONG)) + return; + + mask = BITMAP_LAST_WORD_MASK(nbits); + value &= mask; + offset = start % BITS_PER_LONG; + space = BITS_PER_LONG - offset; + fit = space >= nbits; + index = BIT_WORD(start); + + map[index] &= (fit ? (~(mask << offset)) : ~BITMAP_FIRST_WORD_MASK(start)); + map[index] |= value << offset; + if (fit) + return; + + map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits); + map[index + 1] |= (value >> space); +} + #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ -- cgit v1.2.3 From 72cc1980a0ef3ccad0d539e7dace63d0d7d432a4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:41 +0100 Subject: bitops: add missing prototype check Commit 8238b4579866 ("wait_on_bit: add an acquire memory barrier") added a new bitop, test_bit_acquire(), with proper wrapping in order to try to optimize it at compile-time, but missed the list of bitops used for checking their prototypes a bit below. The functions added have consistent prototypes, so that no more changes are required and no functional changes take place. Fixes: 8238b4579866 ("wait_on_bit: add an acquire memory barrier") Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitops.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2ba557e067fe..f7f5a783da2a 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -80,6 +80,7 @@ __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); +__check_bitop_pr(test_bit_acquire); #undef __check_bitop_pr -- cgit v1.2.3 From 7d8296b250f2eed73f1758607926d4d258dea5d4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:42 +0100 Subject: bitops: make BYTES_TO_BITS() treewide-available Avoid open-coding that simple expression each time by moving BYTES_TO_BITS() from the probes code to to export it to the rest of the kernel. Simplify the macro while at it. `BITS_PER_LONG / sizeof(long)` always equals to %BITS_PER_BYTE, regardless of the target architecture. Do the same for the tools ecosystem as well (incl. its version of bitops.h). The previous implementation had its implicit type of long, while the new one is int, so adjust the format literal accordingly in the perf code. Suggested-by: Andy Shevchenko Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitops.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index f7f5a783da2a..e0cd09eb91cd 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -21,6 +21,8 @@ #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) +#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) + extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); -- cgit v1.2.3 From 5259401ef8f4b010bc0f9740868e9147ccc45899 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:43 +0100 Subject: bitops: let the compiler optimize {__,}assign_bit() Since commit b03fc1173c0c ("bitops: let optimize out non-atomic bitops on compile-time constants"), the compilers are able to expand inline bitmap operations to compile-time initializers when possible. However, during the round of replacement if-__set-else-__clear with __assign_bit() as per Andy's advice, bloat-o-meter showed +1024 bytes difference in object code size for one module (even one function), where the pattern: DECLARE_BITMAP(foo) = { }; // on the stack, zeroed if (a) __set_bit(const_bit_num, foo); if (b) __set_bit(another_const_bit_num, foo); ... is heavily used, although there should be no difference: the bitmap is zeroed, so the second half of __assign_bit() should be compiled-out as a no-op. I either missed the fact that __assign_bit() has bitmap pointer marked as `volatile` (as we usually do for bitops) or was hoping that the compilers would at least try to look past the `volatile` for __always_inline functions. Anyhow, due to that attribute, the compilers were always compiling the whole expression and no mentioned compile-time optimizations were working. Convert __assign_bit() to a macro since it's a very simple if-else and all of the checks are performed inside __set_bit() and __clear_bit(), thus that wrapper has to be as transparent as possible. After that change, despite it showing only -20 bytes change for vmlinux (due to that it's still relatively unpopular), no drastic code size changes happen when replacing if-set-else-clear for onstack bitmaps with __assign_bit(), meaning the compiler now expands them to the actual operations will all the expected optimizations. Atomic assign_bit() is less affected due to its nature, but let's convert it to a macro as well to keep the code consistent and not leave a place for possible suboptimal codegen. Moreover, with certain kernel configuration it actually gives some saves (x86): do_ip_setsockopt 4154 4099 -55 Suggested-by: Yury Norov # assign_bit(), too Cc: Andy Shevchenko Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitops.h | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index e0cd09eb91cd..b25dc8742124 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -275,23 +275,11 @@ static inline unsigned long fns(unsigned long word, unsigned int n) * @addr: the address to start counting from * @value: the value to assign */ -static __always_inline void assign_bit(long nr, volatile unsigned long *addr, - bool value) -{ - if (value) - set_bit(nr, addr); - else - clear_bit(nr, addr); -} +#define assign_bit(nr, addr, value) \ + ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr))) -static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, - bool value) -{ - if (value) - __set_bit(nr, addr); - else - __clear_bit(nr, addr); -} +#define __assign_bit(nr, addr, value) \ + ((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr))) /** * __ptr_set_bit - Set bit in a pointer's value -- cgit v1.2.3 From 8fab6a9d72e4fa0b0607b3a8011a909cabae79f7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:44 +0100 Subject: linkmode: convert linkmode_{test,set,clear,mod}_bit() to macros Since commit b03fc1173c0c ("bitops: let optimize out non-atomic bitops on compile-time constants"), the non-atomic bitops are macros which can be expanded by the compilers into compile-time expressions, which will result in better optimized object code. Unfortunately, turned out that passing `volatile` to those macros discards any possibility of optimization, as the compilers then don't even try to look whether the passed bitmap is known at compilation time. In addition to that, the mentioned linkmode helpers are marked with `inline`, not `__always_inline`, meaning that it's not guaranteed some compiler won't uninline them for no reason, which will also effectively prevent them from being optimized (it's a well-known thing the compilers sometimes uninline `2 + 2`). Convert linkmode_*_bit() from inlines to macros. Their calling convention are 1:1 with the corresponding bitops, so that it's not even needed to enumerate and map the arguments, only the names. No changes in vmlinux' object code (compiled by LLVM for x86_64) whatsoever, but that doesn't necessarily means the change is meaningless. Reviewed-by: Przemek Kitszel Acked-by: Jakub Kicinski Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/linkmode.h | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 287f590ed56b..d94bfd9ac8cc 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -43,29 +43,10 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1, return bitmap_andnot(dst, src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); } -static inline void linkmode_set_bit(int nr, volatile unsigned long *addr) -{ - __set_bit(nr, addr); -} - -static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr) -{ - __clear_bit(nr, addr); -} - -static inline void linkmode_mod_bit(int nr, volatile unsigned long *addr, - int set) -{ - if (set) - linkmode_set_bit(nr, addr); - else - linkmode_clear_bit(nr, addr); -} - -static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr) -{ - return test_bit(nr, addr); -} +#define linkmode_test_bit test_bit +#define linkmode_set_bit __set_bit +#define linkmode_clear_bit __clear_bit +#define linkmode_mod_bit __assign_bit static inline void linkmode_set_bit_array(const int *array, int array_size, unsigned long *addr) -- cgit v1.2.3 From a37fbe666c016fd89e4460d0ebfcea05baba46dc Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:49 +0100 Subject: bitmap: introduce generic optimized bitmap_size() The number of times yet another open coded `BITS_TO_LONGS(nbits) * sizeof(long)` can be spotted is huge. Some generic helper is long overdue. Add one, bitmap_size(), but with one detail. BITS_TO_LONGS() uses DIV_ROUND_UP(). The latter works well when both divident and divisor are compile-time constants or when the divisor is not a pow-of-2. When it is however, the compilers sometimes tend to generate suboptimal code (GCC 13): 48 83 c0 3f add $0x3f,%rax 48 c1 e8 06 shr $0x6,%rax 48 8d 14 c5 00 00 00 00 lea 0x0(,%rax,8),%rdx %BITS_PER_LONG is always a pow-2 (either 32 or 64), but GCC still does full division of `nbits + 63` by it and then multiplication by 8. Instead of BITS_TO_LONGS(), use ALIGN() and then divide by 8. GCC: 8d 50 3f lea 0x3f(%rax),%edx c1 ea 03 shr $0x3,%edx 81 e2 f8 ff ff 1f and $0x1ffffff8,%edx Now it shifts `nbits + 63` by 3 positions (IOW performs fast division by 8) and then masks bits[2:0]. bloat-o-meter: add/remove: 0/0 grow/shrink: 20/133 up/down: 156/-773 (-617) Clang does it better and generates the same code before/after starting from -O1, except that with the ALIGN() approach it uses %edx and thus still saves some bytes: add/remove: 0/0 grow/shrink: 9/133 up/down: 18/-538 (-520) Note that we can't expand DIV_ROUND_UP() by adding a check and using this approach there, as it's used in array declarations where expressions are not allowed. Add this helper to tools/ as well. Reviewed-by: Przemek Kitszel Acked-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitmap.h | 8 +++++--- include/linux/cpumask.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 914c23e96f26..363e0b184a45 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -226,9 +226,11 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = 0; @@ -238,7 +240,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = ~0UL; @@ -249,7 +251,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { - unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = bitmap_size(nbits); if (small_const_nbits(nbits)) *dst = *src; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1c29947db848..6519f9c77709 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -853,7 +853,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) */ static inline unsigned int cpumask_size(void) { - return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long); + return bitmap_size(large_cpumask_bits); } /* -- cgit v1.2.3 From b44759705f7dc95d539d145b4c2edcaf079e7c33 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:50 +0100 Subject: bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}() Now that we have generic bitmap_read() and bitmap_write(), which are inline and try to take care of non-bound-crossing and aligned cases to keep them optimized, collapse bitmap_{get,set}_value8() into simple wrappers around the former ones. bloat-o-meter shows no difference in vmlinux and -2 bytes for gpio-pca953x.ko, which says the optimization didn't suffer due to that change. The converted helpers have the value width embedded and always compile-time constant and that helps a lot. Suggested-by: Yury Norov Signed-off-by: Yury Norov Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/bitmap.h | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 363e0b184a45..8c4768c44a01 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -727,39 +727,6 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) bitmap_from_arr64(dst, &mask, 64); } -/** - * bitmap_get_value8 - get an 8-bit value within a memory region - * @map: address to the bitmap memory region - * @start: bit offset of the 8-bit value; must be a multiple of 8 - * - * Returns the 8-bit value located at the @start bit offset within the @src - * memory region. - */ -static inline unsigned long bitmap_get_value8(const unsigned long *map, - unsigned long start) -{ - const size_t index = BIT_WORD(start); - const unsigned long offset = start % BITS_PER_LONG; - - return (map[index] >> offset) & 0xFF; -} - -/** - * bitmap_set_value8 - set an 8-bit value within a memory region - * @map: address to the bitmap memory region - * @value: the 8-bit value; values wider than 8 bits may clobber bitmap - * @start: bit offset of the 8-bit value; must be a multiple of 8 - */ -static inline void bitmap_set_value8(unsigned long *map, unsigned long value, - unsigned long start) -{ - const size_t index = BIT_WORD(start); - const unsigned long offset = start % BITS_PER_LONG; - - map[index] &= ~(0xFFUL << offset); - map[index] |= value << offset; -} - /** * bitmap_read - read a value of n-bits from the memory region * @map: address to the bitmap memory region @@ -833,6 +800,11 @@ static inline void bitmap_write(unsigned long *map, unsigned long value, map[index + 1] |= (value >> space); } +#define bitmap_get_value8(map, start) \ + bitmap_read(map, start, BITS_PER_BYTE) +#define bitmap_set_value8(map, value, start) \ + bitmap_write(map, value, start, BITS_PER_BYTE) + #endif /* __ASSEMBLY__ */ #endif /* __LINUX_BITMAP_H */ -- cgit v1.2.3 From 117aef12a7b1b797bce9f66b156c65eab850b5b5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 16:23:52 +0100 Subject: ip_tunnel: use a separate struct to store tunnel params in the kernel Unlike IPv6 tunnels which use purely-kernel __ip6_tnl_parm structure to store params inside the kernel, IPv4 tunnel code uses the same ip_tunnel_parm which is being used to talk with the userspace. This makes it difficult to alter or add any fields or use a different format for whatever data. Define struct ip_tunnel_parm_kern, a 1:1 copy of ip_tunnel_parm for now, and use it throughout the code. Define the pieces, where the copy user <-> kernel happens, as standalone functions, and copy the data there field-by-field, so that the kernel-side structure could be easily modified later on and the users wouldn't have to care about this. Reviewed-by: Simon Horman Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e41d30ebaca6..55c7cf9404a4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,7 @@ struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; -struct ip_tunnel_parm; +struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_name_node; @@ -1327,7 +1327,7 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); @@ -1583,7 +1583,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, - struct ip_tunnel_parm *p, int cmd); + struct ip_tunnel_parm_kern *p, + int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); -- cgit v1.2.3