From 74d23cc704d19732e70ef1579a669f7d5f09dd9a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:56 +0100 Subject: time: move the timecounter/cyclecounter code into its own file. The timecounter code has almost nothing to do with the clocksource code. Let it live in its own file. This will help isolate the timecounter users from the clocksource users in the source tree. Signed-off-by: Richard Cochran Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- include/linux/clocksource.h | 102 ------------------------------------ include/linux/mlx4/device.h | 2 +- include/linux/timecounter.h | 122 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/types.h | 3 ++ 4 files changed, 126 insertions(+), 103 deletions(-) create mode 100644 include/linux/timecounter.h (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index abcafaa20b86..9c78d15d33e4 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -18,8 +18,6 @@ #include #include -/* clocksource cycle base type */ -typedef u64 cycle_t; struct clocksource; struct module; @@ -27,106 +25,6 @@ struct module; #include #endif -/** - * struct cyclecounter - hardware abstraction for a free running counter - * Provides completely state-free accessors to the underlying hardware. - * Depending on which hardware it reads, the cycle counter may wrap - * around quickly. Locking rules (if necessary) have to be defined - * by the implementor and user of specific instances of this API. - * - * @read: returns the current cycle value - * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro - * @mult: cycle to nanosecond multiplier - * @shift: cycle to nanosecond divisor (power of two) - */ -struct cyclecounter { - cycle_t (*read)(const struct cyclecounter *cc); - cycle_t mask; - u32 mult; - u32 shift; -}; - -/** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds - * Contains the state needed by timecounter_read() to detect - * cycle counter wrap around. Initialize with - * timecounter_init(). Also used to convert cycle counts into the - * corresponding nanosecond counts with timecounter_cyc2time(). Users - * of this code are responsible for initializing the underlying - * cycle counter hardware, locking issues and reading the time - * more often than the cycle counter wraps around. The nanosecond - * counter will only wrap around after ~585 years. - * - * @cc: the cycle counter used by this instance - * @cycle_last: most recent cycle counter value seen by - * timecounter_read() - * @nsec: continuously increasing count - */ -struct timecounter { - const struct cyclecounter *cc; - cycle_t cycle_last; - u64 nsec; -}; - -/** - * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds - * @cc: Pointer to cycle counter. - * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. - */ -static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) -{ - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; -} - -/** - * timecounter_init - initialize a time counter - * @tc: Pointer to time counter which is to be initialized/reset - * @cc: A cycle counter, ready to be used. - * @start_tstamp: Arbitrary initial time stamp. - * - * After this call the current cycle register (roughly) corresponds to - * the initial time stamp. Every call to timecounter_read() increments - * the time stamp counter by the number of elapsed nanoseconds. - */ -extern void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp); - -/** - * timecounter_read - return nanoseconds elapsed since timecounter_init() - * plus the initial time stamp - * @tc: Pointer to time counter. - * - * In other words, keeps track of time since the same epoch as - * the function which generated the initial time stamp. - */ -extern u64 timecounter_read(struct timecounter *tc); - -/** - * timecounter_cyc2time - convert a cycle counter to same - * time base as values returned by - * timecounter_read() - * @tc: Pointer to time counter. - * @cycle_tstamp: a value returned by tc->cc->read() - * - * Cycle counts that are converted correctly as long as they - * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], - * with "max cycle count" == cs->mask+1. - * - * This allows conversion of cycle counter values which were generated - * in the past. - */ -extern u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp); - /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..f1e41b33462f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -42,7 +42,7 @@ #include -#include +#include #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h new file mode 100644 index 000000000000..146f07a6651b --- /dev/null +++ b/include/linux/timecounter.h @@ -0,0 +1,122 @@ +/* + * linux/include/linux/timecounter.h + * + * based on code that migrated away from + * linux/include/linux/clocksource.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _LINUX_TIMECOUNTER_H +#define _LINUX_TIMECOUNTER_H + +#include + +/** + * struct cyclecounter - hardware abstraction for a free running counter + * Provides completely state-free accessors to the underlying hardware. + * Depending on which hardware it reads, the cycle counter may wrap + * around quickly. Locking rules (if necessary) have to be defined + * by the implementor and user of specific instances of this API. + * + * @read: returns the current cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters, + * see CLOCKSOURCE_MASK() helper macro + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + */ +struct cyclecounter { + cycle_t (*read)(const struct cyclecounter *cc); + cycle_t mask; + u32 mult; + u32 shift; +}; + +/** + * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * Contains the state needed by timecounter_read() to detect + * cycle counter wrap around. Initialize with + * timecounter_init(). Also used to convert cycle counts into the + * corresponding nanosecond counts with timecounter_cyc2time(). Users + * of this code are responsible for initializing the underlying + * cycle counter hardware, locking issues and reading the time + * more often than the cycle counter wraps around. The nanosecond + * counter will only wrap around after ~585 years. + * + * @cc: the cycle counter used by this instance + * @cycle_last: most recent cycle counter value seen by + * timecounter_read() + * @nsec: continuously increasing count + */ +struct timecounter { + const struct cyclecounter *cc; + cycle_t cycle_last; + u64 nsec; +}; + +/** + * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds + * @cc: Pointer to cycle counter. + * @cycles: Cycles + * + * XXX - This could use some mult_lxl_ll() asm optimization. Same code + * as in cyc2ns, but with unsigned result. + */ +static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, + cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cc->mult) >> cc->shift; + return ret; +} + +/** + * timecounter_init - initialize a time counter + * @tc: Pointer to time counter which is to be initialized/reset + * @cc: A cycle counter, ready to be used. + * @start_tstamp: Arbitrary initial time stamp. + * + * After this call the current cycle register (roughly) corresponds to + * the initial time stamp. Every call to timecounter_read() increments + * the time stamp counter by the number of elapsed nanoseconds. + */ +extern void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp); + +/** + * timecounter_read - return nanoseconds elapsed since timecounter_init() + * plus the initial time stamp + * @tc: Pointer to time counter. + * + * In other words, keeps track of time since the same epoch as + * the function which generated the initial time stamp. + */ +extern u64 timecounter_read(struct timecounter *tc); + +/** + * timecounter_cyc2time - convert a cycle counter to same + * time base as values returned by + * timecounter_read() + * @tc: Pointer to time counter. + * @cycle_tstamp: a value returned by tc->cc->read() + * + * Cycle counts that are converted correctly as long as they + * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], + * with "max cycle count" == cs->mask+1. + * + * This allows conversion of cycle counter values which were generated + * in the past. + */ +extern u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index a0bb7048687f..62323825cff9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -213,5 +213,8 @@ struct callback_head { }; #define rcu_head callback_head +/* clocksource cycle base type */ +typedef u64 cycle_t; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ -- cgit v1.2.3 From 796c1efd6fa0ed696d550b68f4410ab1a1749d01 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:57 +0100 Subject: timecounter: provide a helper function to shift the time. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PTP Hardware Clock drivers use a struct timecounter to represent their clock. To adjust the time by a given offset, these drivers all perform a two step read/write of their timecounter. However, it is better and simpler just to adjust the offset in one step. This patch introduces a little routine to help drivers implement the adjtime method. Suggested-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 146f07a6651b..af3dfa4e90f0 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -78,6 +78,15 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, return ret; } +/** + * timecounter_adjtime - Shifts the time of the clock. + * @delta: Desired change in nanoseconds. + */ +static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) +{ + tc->nsec += delta; +} + /** * timecounter_init - initialize a time counter * @tc: Pointer to time counter which is to be initialized/reset -- cgit v1.2.3 From 2eebdde6528a722fbf8e2cffcf7aa52cbb4c2de0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:47:06 +0100 Subject: timecounter: keep track of accumulated fractional nanoseconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current timecounter implementation will drop a variable amount of resolution, depending on the magnitude of the time delta. In other words, reading the clock too often or too close to a time stamp conversion will introduce errors into the time values. This patch fixes the issue by introducing a fractional nanosecond field that accumulates the low order bits. Reported-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index af3dfa4e90f0..74f45496e6d1 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -55,27 +55,32 @@ struct cyclecounter { * @cycle_last: most recent cycle counter value seen by * timecounter_read() * @nsec: continuously increasing count + * @mask: bit mask for maintaining the 'frac' field + * @frac: accumulated fractional nanoseconds */ struct timecounter { const struct cyclecounter *cc; cycle_t cycle_last; u64 nsec; + u64 mask; + u64 frac; }; /** * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds * @cc: Pointer to cycle counter. * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. + * @mask: bit mask for maintaining the 'frac' field + * @frac: pointer to storage for the fractional nanoseconds. */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) + cycle_t cycles, u64 mask, u64 *frac) { - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; + u64 ns = (u64) cycles; + + ns = (ns * cc->mult) + *frac; + *frac = ns & mask; + return ns >> cc->shift; } /** -- cgit v1.2.3 From dd450777990baae668c1143064f2f234dbab1b9b Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Wed, 24 Dec 2014 01:14:14 +0300 Subject: arm: sa1100: move irda header to linux/platform_data In the end asm/mach/irda.h header is not used by anybody except sa1100. Move the header to the platform data includes dir and rename it to irda-sa11x0.h. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: David S. Miller --- include/linux/platform_data/irda-sa11x0.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/linux/platform_data/irda-sa11x0.h (limited to 'include/linux') diff --git a/include/linux/platform_data/irda-sa11x0.h b/include/linux/platform_data/irda-sa11x0.h new file mode 100644 index 000000000000..38f77b5e56cf --- /dev/null +++ b/include/linux/platform_data/irda-sa11x0.h @@ -0,0 +1,20 @@ +/* + * arch/arm/include/asm/mach/irda.h + * + * Copyright (C) 2004 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __ASM_ARM_MACH_IRDA_H +#define __ASM_ARM_MACH_IRDA_H + +struct irda_platform_data { + int (*startup)(struct device *); + void (*shutdown)(struct device *); + int (*set_power)(struct device *, unsigned int state); + void (*set_speed)(struct device *, unsigned int speed); +}; + +#endif -- cgit v1.2.3 From de40ed31b3c577cefd7b54972365a272ecbe9dd6 Mon Sep 17 00:00:00 2001 From: Nimrod Andy Date: Wed, 24 Dec 2014 17:30:39 +0800 Subject: net: fec: add Wake-on-LAN support Support for Wake-on-LAN using Magic Packet. ENET IP supports sleep mode in low power status, when system enter suspend status, Magic packet can wake up system even if all SOC clocks are gate. The patch doing below things: - flagging the device as a wakeup source for the system, as well as its Wake-on-LAN interrupt - prepare the hardware for entering WoL mode - add standard ethtool WOL interface - enable the ENET interrupt to wake us Tested on i.MX6q/dl sabresd, sabreauto boards, i.MX6SX arm2 boards. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- include/linux/fec.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fec.h b/include/linux/fec.h index bcff455d1d53..1454a503622d 100644 --- a/include/linux/fec.h +++ b/include/linux/fec.h @@ -19,6 +19,7 @@ struct fec_platform_data { phy_interface_t phy; unsigned char mac[ETH_ALEN]; + void (*sleep_mode_enable)(int enabled); }; #endif -- cgit v1.2.3 From 9b174d88c257150562b0101fcc6cb6c3cb74275c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 30 Dec 2014 19:10:15 -0800 Subject: net: Add Transparent Ethernet Bridging GRO support. Currently the only tunnel protocol that supports GRO with encapsulated Ethernet is VXLAN. This pulls out the Ethernet code into a proper layer so that it can be used by other tunnel protocols such as GRE and Geneve. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 41c891d05f04..1d869d185a0d 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -52,6 +52,10 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) +struct sk_buff **eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb); +int eth_gro_complete(struct sk_buff *skb, int nhoff); + /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; -- cgit v1.2.3 From 1891172aa5c32f08ad9931b794edd71e91a4a527 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 2 Jan 2015 20:22:03 +0100 Subject: timecounter: provide a macro to initialize the cyclecounter mask field. There is no need for users of the timecounter/cyclecounter code to include clocksource.h just for a single macro. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 74f45496e6d1..4382035a75bb 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -19,6 +19,9 @@ #include +/* simplify initialization of mask field */ +#define CYCLECOUNTER_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) + /** * struct cyclecounter - hardware abstraction for a free running counter * Provides completely state-free accessors to the underlying hardware. @@ -29,7 +32,7 @@ * @read: returns the current cycle value * @mask: bitmask for two's complement * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro + * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) */ -- cgit v1.2.3 From 8d24c0b43125ec26cc80e04588477a9a2afc025c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:14 +0100 Subject: rhashtable: Do hashing inside of rhashtable_lookup_compare() Hash the key inside of rhashtable_lookup_compare() like rhashtable_lookup() does. This allows to simplify the hashing functions and keep them private. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b93fd89b2e5e..1b51221c6bbd 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -96,9 +96,6 @@ static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); -u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len); -u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr); - void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, @@ -111,7 +108,7 @@ int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, +void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); void rhashtable_destroy(const struct rhashtable *ht); -- cgit v1.2.3 From 88d6ed15acff1cb44b1d1f3c0a393b7f7744957a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:16 +0100 Subject: rhashtable: Convert bucket iterators to take table and index This patch is in preparation to introduce per bucket spinlocks. It extends all iterator macros to take the bucket table and bucket index. It also introduces a new rht_dereference_bucket() to handle protected accesses to buckets. It introduces a barrier() to the RCU iterators to the prevent the compiler from caching the first element. The lockdep verifier is introduced as stub which always succeeds and properly implement in the next patch when the locks are introduced. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 173 ++++++++++++++++++++++++++++++--------------- 1 file changed, 116 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 1b51221c6bbd..b54e24a08806 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -87,11 +87,18 @@ struct rhashtable { #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) { return 1; } + +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} #endif /* CONFIG_PROVE_LOCKING */ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); @@ -119,92 +126,144 @@ void rhashtable_destroy(const struct rhashtable *ht); #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) -#define rht_entry(ptr, type, member) container_of(ptr, type, member) -#define rht_entry_safe(ptr, type, member) \ -({ \ - typeof(ptr) __ptr = (ptr); \ - __ptr ? rht_entry(__ptr, type, member) : NULL; \ -}) +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) -#define rht_next_entry_safe(pos, ht, member) \ -({ \ - pos ? rht_entry_safe(rht_dereference((pos)->member.next, ht), \ - typeof(*(pos)), member) : NULL; \ -}) +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + +#define rht_entry(tpos, pos, member) \ + ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) /** - * rht_for_each - iterate over hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * rht_for_each_continue - continue iterating over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index */ -#define rht_for_each(pos, head, ht) \ - for (pos = rht_dereference(head, ht); \ +#define rht_for_each_continue(pos, head, tbl, hash) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ pos; \ - pos = rht_dereference((pos)->next, ht)) + pos = rht_dereference_bucket((pos)->next, tbl, hash)) + +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each(pos, tbl, hash) \ + rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_continue - continue iterating over hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. */ -#define rht_for_each_entry(pos, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ + tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @n: type * to use for temporary next object storage - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(pos, n, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member), \ - n = rht_next_entry_safe(pos, ht, member); \ - pos; \ - pos = n, \ - n = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ + next = pos ? rht_dereference_bucket(pos->next, tbl, hash) \ + : NULL; \ + pos && rht_entry(tpos, pos, member); \ + pos = next) + +/** + * rht_for_each_rcu_continue - continue iterating over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos; \ + pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_rcu(pos, head, ht) \ - for (pos = rht_dereference_rcu(head, ht); \ - pos; \ - pos = rht_dereference_rcu((pos)->next, ht)) +#define rht_for_each_rcu(pos, tbl, hash) \ + rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu(pos, head, member) \ - for (pos = rht_entry_safe(rcu_dereference_raw(head), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \ - typeof(*(pos)), member)) +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ + tbl, hash, member) #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From 897362e446436d245972e72c6bc5b33bd7a5c659 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:18 +0100 Subject: nft_hash: Remove rhashtable_remove_pprev() The removal function of nft_hash currently stores a reference to the previous element during lookup which is used to optimize removal later on. This was possible because a lock is held throughout calling rhashtable_lookup() and rhashtable_remove(). With the introdution of deferred table resizing in parallel to lookups and insertions, the nftables lock will no longer synchronize all table mutations and the stored pprev may become invalid. Removing this optimization makes removal slightly more expensive on average but allows taking the resize cost out of the insert and remove path. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b54e24a08806..f624d4b5045f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -105,8 +105,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head __rcu **pprev); bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); -- cgit v1.2.3 From 113948d841e8d78039e5dbbb5248f5b73e99eafa Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:19 +0100 Subject: spinlock: Add spin_lock_bh_nested() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/spinlock.h | 8 ++++++++ include/linux/spinlock_api_smp.h | 2 ++ include/linux/spinlock_api_up.h | 1 + 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 262ba4ef9a8e..3e18379dfa6f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -190,6 +190,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) +# define raw_spin_lock_bh_nested(lock, subclass) \ + _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -205,6 +207,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) +# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -324,6 +327,11 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) +#define spin_lock_bh_nested(lock, subclass) \ +do { \ + raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ +} while (0) + #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..5344268e6e62 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) + __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..d3afef9d8dbe 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,6 +57,7 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) +#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) -- cgit v1.2.3 From 97defe1ecf868b8127f8e62395499d6a06e4c4b1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:20 +0100 Subject: rhashtable: Per bucket locks & deferred expansion/shrinking Introduces an array of spinlocks to protect bucket mutations. The number of spinlocks per CPU is configurable and selected based on the hash of the bucket. This allows for parallel insertions and removals of entries which do not share a lock. The patch also defers expansion and shrinking to a worker queue which allows insertion and removal from atomic context. Insertions and deletions may occur in parallel to it and are only held up briefly while the particular bucket is linked or unzipped. Mutations of the bucket table pointer is protected by a new mutex, read access is RCU protected. In the event of an expansion or shrinking, the new bucket table allocated is exposed as a so called future table as soon as the resize process starts. Lookups, deletions, and insertions will briefly use both tables. The future table becomes the main table after an RCU grace period and initial linking of the old to the new table was performed. Optimization of the chains to make use of the new number of buckets follows only the new table is in use. The side effect of this is that during that RCU grace period, a bucket traversal using any rht_for_each() variant on the main table will not see any insertions performed during the RCU grace period which would at that point land in the future table. The lookup will see them as it searches both tables if needed. Having multiple insertions and removals occur in parallel requires nelems to become an atomic counter. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f624d4b5045f..a1688f0a6193 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -19,6 +19,7 @@ #define _LINUX_RHASHTABLE_H #include +#include struct rhash_head { struct rhash_head __rcu *next; @@ -26,8 +27,17 @@ struct rhash_head { #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) +/** + * struct bucket_table - Table of hash buckets + * @size: Number of hash buckets + * @locks_mask: Mask to apply before accessing locks[] + * @locks: Array of spinlocks protecting individual buckets + * @buckets: size * hash buckets + */ struct bucket_table { size_t size; + unsigned int locks_mask; + spinlock_t *locks; struct rhash_head __rcu *buckets[]; }; @@ -45,11 +55,11 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink - * @mutex_is_held: Must return true if protecting mutex is held */ struct rhashtable_params { size_t nelem_hint; @@ -59,37 +69,42 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, size_t new_size); bool (*shrink_decision)(const struct rhashtable *ht, size_t new_size); -#ifdef CONFIG_PROVE_LOCKING - int (*mutex_is_held)(void *parent); - void *parent; -#endif }; /** * struct rhashtable - Hash table handle * @tbl: Bucket table + * @future_tbl: Table under construction during expansion/shrinking * @nelems: Number of elements in table * @shift: Current size (1 << shift) * @p: Configuration parameters + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @being_destroyed: True if table is set up for destruction */ struct rhashtable { struct bucket_table __rcu *tbl; - size_t nelems; + struct bucket_table __rcu *future_tbl; + atomic_t nelems; size_t shift; struct rhashtable_params p; + struct delayed_work run_work; + struct mutex mutex; + bool being_destroyed; }; #ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else -static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } @@ -112,11 +127,11 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); -void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, +void *rhashtable_lookup(struct rhashtable *ht, const void *key); +void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); -void rhashtable_destroy(const struct rhashtable *ht); +void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) -- cgit v1.2.3 From f89bd6f87a53ce5a7d60662429591ebac2745c10 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:21 +0100 Subject: rhashtable: Supports for nulls marker In order to allow for wider usage of rhashtable, use a special nulls marker to terminate each chain. The reason for not using the existing nulls_list is that the prev pointer usage would not be valid as entries can be linked in two different buckets at the same time. The 4 nulls base bits can be set through the rhashtable_params structure like this: struct rhashtable_params params = { [...] .nulls_base = (1U << RHT_BASE_SHIFT), }; This reduces the hash length from 32 bits to 27 bits. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 3 ++- include/linux/rhashtable.h | 57 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 5d10ae364b5e..e8c300e06438 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -21,8 +21,9 @@ struct hlist_nulls_head { struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; +#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ - ((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1))) + ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) /** diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a1688f0a6193..de7cac753b09 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,15 +18,32 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include +#include #include +/* + * The end of the chain is marked with a special nulls marks which has + * the following format: + * + * +-------+-----------------------------------------------------+-+ + * | Base | Hash |1| + * +-------+-----------------------------------------------------+-+ + * + * Base (4 bits) : Reserved to distinguish between multiple tables. + * Specified via &struct rhashtable_params.nulls_base. + * Hash (27 bits): Full hash (unmasked) of first element added to bucket + * 1 (1 bit) : Nulls marker (always set) + * + * The remaining bits of the next pointer remain unused for now. + */ +#define RHT_BASE_BITS 4 +#define RHT_HASH_BITS 27 +#define RHT_BASE_SHIFT RHT_HASH_BITS + struct rhash_head { struct rhash_head __rcu *next; }; -#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -55,6 +72,7 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @nulls_base: Base value to generate nulls marker * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object @@ -69,6 +87,7 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + u32 nulls_base; size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; @@ -100,6 +119,24 @@ struct rhashtable { bool being_destroyed; }; +static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) +{ + return NULLS_MARKER(ht->p.nulls_base + hash); +} + +#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ + ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + +static inline bool rht_is_a_nulls(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr & 1); +} + +static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr) >> 1; +} + #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); @@ -157,7 +194,7 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_continue(pos, head, tbl, hash) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -180,7 +217,7 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -209,9 +246,9 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ - next = pos ? rht_dereference_bucket(pos->next, tbl, hash) \ - : NULL; \ - pos && rht_entry(tpos, pos, member); \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next) /** @@ -228,7 +265,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_rcu_continue(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** @@ -260,7 +297,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** -- cgit v1.2.3 From 86b35b64ed7b6b38305dee67a0f2ddff2ca5455d Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:25:09 +0800 Subject: rhashtable: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/rhashtable.h: At top level: include/linux/rhashtable.h:118:34: error: field ‘mutex’ has incomplete type Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de7cac753b09..de1459c74c4d 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include /* * The end of the chain is marked with a special nulls marks which has -- cgit v1.2.3 From a3449ded128d037e6b2bd7a59b0741de56506066 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:24:35 +0800 Subject: list_nulls: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/list_nulls.h: In function ‘hlist_nulls_del’: include/linux/list_nulls.h:84:13: error: ‘LIST_POISON2’ undeclared (first use in this function) Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index e8c300e06438..f266661d2666 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -1,6 +1,9 @@ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H +#include +#include + /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. -- cgit v1.2.3 From 224d019c4fbba242041e9b25a926ba873b7da1e2 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 5 Jan 2015 13:56:14 -0800 Subject: ip: Move checksum convert defines to inet Move convert_csum from udp_sock to inet_sock. This allows the possibility that we can use convert checksum for different types of sockets and also allows convert checksum to be enabled from inet layer (what we'll want to do when enabling IP_CHECKSUM cmsg). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index ee3277593222..247cfdcc4b08 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,11 +49,7 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ - convert_csum:1;/* On receive, convert checksum - * unnecessary to checksum complete - * if possible. - */ + no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -102,16 +98,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } -static inline void udp_set_convert_csum(struct sock *sk, bool val) -{ - udp_sk(sk)->convert_csum = val; -} - -static inline bool udp_get_convert_csum(struct sock *sk) -{ - return udp_sk(sk)->convert_csum; -} - #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) -- cgit v1.2.3 From 8b3a38daff6f50027039d6979b9eb026907508eb Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Tue, 23 Dec 2014 19:04:23 +0100 Subject: brcmfmac: Add support for bcm43340/1 wireless chipsets This patch adds support for the bcm43340 and bcm43341 wireless chipsets. These two chipsets are identical from wireless parts perspective. As such they use the same firmware image. Cc: Samuel Ortiz Cc: Rob Herring Signed-off-by: John Stultz [arend@broadcom.com: squash to single commit, remove 43341 chipid] Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- include/linux/mmc/sdio_ids.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0f01fe065424..996807963716 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -24,13 +24,15 @@ * Vendors and devices. Sort key: vendor first, device next. */ #define SDIO_VENDOR_ID_BROADCOM 0x02d0 -#define SDIO_DEVICE_ID_BROADCOM_43143 43143 +#define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 #define SDIO_DEVICE_ID_BROADCOM_4329 0x4329 #define SDIO_DEVICE_ID_BROADCOM_4330 0x4330 #define SDIO_DEVICE_ID_BROADCOM_4334 0x4334 +#define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c +#define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 -#define SDIO_DEVICE_ID_BROADCOM_43362 43362 +#define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 #define SDIO_VENDOR_ID_INTEL 0x0089 -- cgit v1.2.3 From 2f4383667d57d1c719070db86b14277277752841 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Fri, 2 Jan 2015 17:27:56 -0800 Subject: ethtool: Extend ethtool plugin module eeprom API to phylib This patch extends the ethtool plugin module eeprom API to support cards whose phy support is delegated to a separate driver. The handlers for ETHTOOL_GMODULEINFO and ETHTOOL_GMODULEEEPROM call the module_info and module_eeprom functions if the phy driver provides them; otherwise the handlers call the equivalent ethtool_ops functions provided by network drivers with built-in phy support. Signed-off-by: Ed Swierk Signed-off-by: David S. Miller --- include/linux/phy.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 22af8f8f5802..9c189a1fa3a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -565,6 +565,15 @@ struct phy_driver { void (*write_mmd_indirect)(struct phy_device *dev, int ptrad, int devnum, int regnum, u32 val); + /* Get the size and type of the eeprom contained within a plug-in + * module */ + int (*module_info)(struct phy_device *dev, + struct ethtool_modinfo *modinfo); + + /* Get the eeprom information from the plug-in module */ + int (*module_eeprom)(struct phy_device *dev, + struct ethtool_eeprom *ee, u8 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) -- cgit v1.2.3 From db30485408326a6f466a843b291b23535f63eda0 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:54 +0800 Subject: rhashtable: involve rhashtable_lookup_insert routine Involve a new function called rhashtable_lookup_insert() which makes lookup and insertion atomic under bucket lock protection, helping us avoid to introduce an extra lock when we search and insert an object into hash table. Signed-off-by: Ying Xue Signed-off-by: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de1459c74c4d..73c913f31574 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,6 +168,7 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); +bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); void rhashtable_destroy(struct rhashtable *ht); -- cgit v1.2.3 From c0c09bfdc4150b3918526660768585cd477adf35 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:56 +0800 Subject: rhashtable: avoid unnecessary wakeup for worker queue Move condition statements of verifying whether hash table size exceeds its maximum threshold or reaches its minimum threshold from resizing functions to resizing decision functions, avoiding unnecessary wakeup for worker queue thread. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 73c913f31574..326acd8c2e9f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -113,7 +113,7 @@ struct rhashtable { struct bucket_table __rcu *tbl; struct bucket_table __rcu *future_tbl; atomic_t nelems; - size_t shift; + atomic_t shift; struct rhashtable_params p; struct delayed_work run_work; struct mutex mutex; -- cgit v1.2.3 From 7a868d1e9ab3c534c5ad44e3e5dc46753a1e5636 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:22 +0800 Subject: rhashtable: involve rhashtable_lookup_compare_insert routine Introduce a new function called rhashtable_lookup_compare_insert() which is very similar to rhashtable_lookup_insert(). But the former makes use of users' given compare function to look for an object, and then inserts it into hash table if found. As the entire process of search and insertion is under protection of per bucket lock, this can help users to avoid the involvement of extra lock. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 326acd8c2e9f..7b9bd77ed684 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,7 +168,12 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); + bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); +bool rhashtable_lookup_compare_insert(struct rhashtable *ht, + struct rhash_head *obj, + bool (*compare)(void *, void *), + void *arg); void rhashtable_destroy(struct rhashtable *ht); -- cgit v1.2.3 From 6f73d3b13dc5e16ae06025cd1b12a36b2857caa2 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:24 +0800 Subject: rhashtable: add a note for grow and shrink decision functions As commit c0c09bfdc415 ("rhashtable: avoid unnecessary wakeup for worker queue") moves condition statements of verifying whether hash table size exceeds its maximum threshold or reaches its minimum threshold from resizing functions to resizing decision functions, we should add a note in rhashtable.h to indicate the implementation of what the grow and shrink decision function must enforce min/max shift, otherwise, it's failed to take min/max shift's set watermarks into effect. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7b9bd77ed684..9570832ab07c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,6 +79,10 @@ struct rhashtable; * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink + * + * Note: when implementing the grow and shrink decision function, min/max + * shift must be enforced, otherwise, resizing watermarks they set may be + * useless. */ struct rhashtable_params { size_t nelem_hint; -- cgit v1.2.3 From df8a39defad46b83694ea6dd868d332976d62cc0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Jan 2015 17:13:44 +0100 Subject: net: rename vlan_tx_* helpers since "tx" is misleading there The same macros are used for rx as well. So rename it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..bea465f24ebb 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -78,9 +78,9 @@ static inline bool is_vlan_dev(struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) -#define vlan_tx_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) +#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats @@ -376,7 +376,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (likely(skb)) skb->vlan_tci = 0; return skb; @@ -393,7 +393,7 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) */ static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) skb = __vlan_hwaccel_push_inside(skb); return skb; } @@ -442,8 +442,8 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { - if (vlan_tx_tag_present(skb)) { - *vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; @@ -480,7 +480,7 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { __be16 protocol = 0; - if (vlan_tx_tag_present(skb) || + if (skb_vlan_tag_present(skb) || skb->protocol != cpu_to_be16(ETH_P_8021Q)) protocol = skb->protocol; else { -- cgit v1.2.3 From a2b12f3c7ac1ea43ae646db74faf0b56c2bba563 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 12 Jan 2015 17:00:37 -0800 Subject: udp: pass udp_offload struct to UDP gro callbacks This patch introduces udp_offload_callbacks which has the same GRO functions (but not a GSO function) as offload_callbacks, except there is an argument to a udp_offload struct passed to gro_receive and gro_complete functions. This additional argument can be used to retrieve the per port structure of the encapsulation for use in gro processing (mostly by doing container_of on the structure). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 679e6e90aa4c..47921c291dd6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1969,7 +1969,7 @@ struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -1979,10 +1979,21 @@ struct packet_offload { struct list_head list; }; +struct udp_offload; + +struct udp_offload_callbacks { + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff); + int (*gro_complete)(struct sk_buff *skb, + int nhoff, + struct udp_offload *uoff); +}; + struct udp_offload { __be16 port; u8 ipproto; - struct offload_callbacks callbacks; + struct udp_offload_callbacks callbacks; }; /* often modified stats are per cpu, other are shared (netdev->stats) */ -- cgit v1.2.3 From 57699a40b4f2694d3ee63fd5e6465ec8f600b620 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 16 Jan 2015 11:13:09 +0800 Subject: rhashtable: Fix race in rhashtable_destroy() and use regular work_struct When we put our declared work task in the global workqueue with schedule_delayed_work(), its delay parameter is always zero. Therefore, we should define a regular work in rhashtable structure instead of a delayed work. By the way, we add a condition to check whether resizing functions are NULL before cancelling the work, avoiding to cancel an uninitialized work. Lastly, while we wait for all work items we submitted before to run to completion with cancel_delayed_work(), ht->mutex has been taken in rhashtable_destroy(). Moreover, cancel_delayed_work() doesn't return until all work items are accomplished, and when work items are scheduled, the work's function - rht_deferred_worker() will be called. However, as rht_deferred_worker() also needs to acquire the lock, deadlock might happen at the moment as the lock is already held before. So if the cancel work function is moved out of the lock covered scope, this will avoid the deadlock. Fixes: 97defe1 ("rhashtable: Per bucket locks & deferred expansion/shrinking") Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 9570832ab07c..a2562ed53ea3 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -119,7 +119,7 @@ struct rhashtable { atomic_t nelems; atomic_t shift; struct rhashtable_params p; - struct delayed_work run_work; + struct work_struct run_work; struct mutex mutex; bool being_destroyed; }; -- cgit v1.2.3 From 3aeb66176ffa8fefd7a9f7d37bda1d8adcf469a1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 15 Jan 2015 23:49:37 +0100 Subject: net: replace br_fdb_external_learn_* calls with switchdev notifier events This patch benefits from newly introduced switchdev notifier and uses it to propagate fdb learn events from rocker driver to bridge. That avoids direct function calls and possible use by other listeners (ovs). Suggested-by: Thomas Graf Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 0a8ce762a47f..a57bca2ea97e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -50,24 +50,6 @@ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __use typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; -#if IS_ENABLED(CONFIG_BRIDGE) -int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid); -int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid); -#else -static inline int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -static inline int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -#endif - #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); -- cgit v1.2.3 From c5ed1df781cb544d4e4d189bb5b6ec7336d8888c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 19 Jan 2015 08:30:30 +0100 Subject: bcma: use standard bus scanning during early register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with kernel 3.19-rc1 early registration of bcma on MIPS is done a bit later, with memory allocator available. This allows us to simplify code by using standard bus scanning method. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_soc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_soc.h b/include/linux/bcma/bcma_soc.h index f24d245f8394..1b5fc0c3b1b5 100644 --- a/include/linux/bcma/bcma_soc.h +++ b/include/linux/bcma/bcma_soc.h @@ -5,8 +5,6 @@ struct bcma_soc { struct bcma_bus bus; - struct bcma_device core_cc; - struct bcma_device core_mips; }; int __init bcma_host_soc_register(struct bcma_soc *soc); -- cgit v1.2.3 From 872bf2fb69d90e3619befee842fc26db39d8e475 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:35 +0200 Subject: net/mlx4_core: Maintain a persistent memory for mlx4 device Maintain a persistent memory that should survive reset flow/PCI error. This comes as a preparation for coming series to support above flows. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f1e41b33462f..1069ce65e8b4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -744,8 +744,15 @@ struct mlx4_vf_dev { u8 n_ports; }; -struct mlx4_dev { +struct mlx4_dev_persistent { struct pci_dev *pdev; + struct mlx4_dev *dev; + int nvfs[MLX4_MAX_PORTS + 1]; + int num_vfs; +}; + +struct mlx4_dev { + struct mlx4_dev_persistent *persist; unsigned long flags; unsigned long num_slaves; struct mlx4_caps caps; @@ -754,13 +761,11 @@ struct mlx4_dev { struct radix_tree_root qp_table_tree; u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; - int num_vfs; int numa_node; int oper_log_mgm_entry_size; u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; - int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { -- cgit v1.2.3 From dd0eefe3abbf47442db296bf68f27eb2860c1cdf Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:36 +0200 Subject: net/mlx4_core: Set device configuration data to be persistent across reset When an HCA enters an internal error state, this is detected by the driver. The driver then should reset the HCA and restart the software stack. Keep ports information and some SRIOV configuration in a persistent area to have it valid across reset. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1069ce65e8b4..8c3837ac1a2d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -749,6 +749,8 @@ struct mlx4_dev_persistent { struct mlx4_dev *dev; int nvfs[MLX4_MAX_PORTS + 1]; int num_vfs; + enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; }; struct mlx4_dev { -- cgit v1.2.3 From ad9a0bf08ffbf32b8f292c3bb78ca0f24bb8f6b2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:37 +0200 Subject: net/mlx4_core: Refactor the catas flow to work per device Using a WQ per device instead of a single global WQ, this allows independent reset handling per device even when SRIOV is used. This comes as a pre-patch for supporting chip reset for both native and SRIOV. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8c3837ac1a2d..da425d2f3708 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -751,6 +751,8 @@ struct mlx4_dev_persistent { int num_vfs; enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; + struct work_struct catas_work; + struct workqueue_struct *catas_wq; }; struct mlx4_dev { -- cgit v1.2.3 From f6bc11e42646e661e699a5593cbd1e9dba7191d0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:38 +0200 Subject: net/mlx4_core: Enhance the catas flow to support device reset This includes: - resetting the chip when a fatal error is detected (the current code does not do this). - exposing the ability to enter error state from outside the catas code by calling its functionality. (E.g. FW Command timeout, AER error). - managing a persistent device state. This is needed to sync between reset flow cases. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index da425d2f3708..7d5d317cb7a6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -411,6 +411,11 @@ enum { MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, }; +enum { + MLX4_DEVICE_STATE_UP = 1 << 0, + MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -753,6 +758,8 @@ struct mlx4_dev_persistent { enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; struct work_struct catas_work; struct workqueue_struct *catas_wq; + struct mutex device_state_mutex; /* protect HW state */ + u8 state; }; struct mlx4_dev { -- cgit v1.2.3 From f5aef5aa35063f2b45c3605871cd525d0cb7fb7a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:39 +0200 Subject: net/mlx4_core: Activate reset flow upon fatal command cases We activate reset flow upon command fatal errors, when the device enters an erroneous state, and must be reset. The cases below are assumed to be fatal: FW command timed-out, an error from FW on closing commands, pci is offline when posting/pending a command. In those cases we place the device into an error state: chip is reset, pending commands are awakened and completed immediately. Subsequent commands will return immediately. The return code in the above cases will depend on the command. Commands which free and close resources will return success (because the chip was reset, so callers may safely free their kernel resources). Other commands will return -EIO. Since the device's state was marked as error, the catas poller will detect this and restart the device's software stack (as is done when a FW internal error is directly detected). The device state is protected by a persistent mutex lives on its mlx4_dev, as such no need any more for the hcr_mutex which is removed. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/cmd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 64d25941b329..e7543844cc7a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -279,6 +279,7 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); +void mlx4_cmd_wake_completions(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) -- cgit v1.2.3 From c69453e294c9f16da977b68e658a8028b854c209 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:40 +0200 Subject: net/mlx4_core: Manage interface state for Reset flow cases We need to manage interface state to sync between reset flow and some other relative cases such as remove_one. This has to be done to prevent certain races. For example in case software stack is down as a result of unload call, the remove_one should skip the unload phase. Implement the remove_one case, handling AER and other cases comes next. The interface can be up/down, upon remove_one, the state will include an extra bit indicating that the device is cleaned-up, forcing other tasks to finish before the final cleanup. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7d5d317cb7a6..33f9ca71925c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -416,6 +416,11 @@ enum { MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, }; +enum { + MLX4_INTERFACE_STATE_UP = 1 << 0, + MLX4_INTERFACE_STATE_DELETION = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -760,6 +765,8 @@ struct mlx4_dev_persistent { struct workqueue_struct *catas_wq; struct mutex device_state_mutex; /* protect HW state */ u8 state; + struct mutex interface_state_mutex; /* protect SW state */ + u8 interface_state; }; struct mlx4_dev { -- cgit v1.2.3 From 55ad359225b2232b9b8f04a0dfa169bd3a7d86d2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:42 +0200 Subject: net/mlx4_core: Enable device recovery flow with SRIOV In SRIOV, both the PF and the VF may attempt device recovery whenever they assume that the device is not functioning. When the PF driver resets the device, the VF should detect this and attempt to reinitialize itself. The VF must be able to reset itself under all circumstances, even if the PF is not responsive. The VF shall reset itself in the following cases: 1. Commands are not processed within reasonable time over the communication channel. This is done considering device state and the correct return code based on the command as was done in the native mode, done in the next patch. 2. The VF driver receives an internal error event reported by the PF on the communication channel. This occurs when the PF driver resets the device or when VF is out of sync with the PF. Add 'VF reset' capability, which allows the VF to reinitialize itself even when the PF is not responsive. As PF and VF may run their reset flow simulantanisly, there are several cases that are handled: - Prevent freeing VF resources upon FLR, when PF is in its unloading stage. - Prevent PF getting VF commands before it has finished initializing its resources. - Upon VF startup, check that comm-channel is online before sending commands to the PF and getting timed-out. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/cmd.h | 2 ++ include/linux/mlx4/device.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index e7543844cc7a..c989442ffc6a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -280,6 +280,7 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); void mlx4_cmd_wake_completions(struct mlx4_dev *dev); +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) @@ -289,5 +290,6 @@ bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos); #define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) +#define COMM_CHAN_EVENT_INTERNAL_ERR (1 << 17) #endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 33f9ca71925c..5ef54e145e4d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -208,6 +208,10 @@ enum { MLX4_QUERY_FUNC_FLAGS_A0_RES_QP = 1LL << 1 }; +enum { + MLX4_VF_CAP_FLAG_RESET = 1 << 0 +}; + /* bit enums for an 8-bit flags field indicating special use * QPs which require special handling in qp_reserve_range. * Currently, this only includes QPs used by the ETH interface, @@ -545,6 +549,7 @@ struct mlx4_caps { u8 alloc_res_qp_mask; u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; + u32 vf_caps; }; struct mlx4_buf_list { -- cgit v1.2.3 From c2943f14534bdc4230f4da6dcd4ea03c5d8c8162 Mon Sep 17 00:00:00 2001 From: Harout Hedeshian Date: Tue, 20 Jan 2015 10:06:05 -0700 Subject: net: ipv6: Add sysctl entry to disable MTU updates from RA The kernel forcefully applies MTU values received in router advertisements provided the new MTU is less than the current. This behavior is undesirable when the user space is managing the MTU. Instead a sysctl flag 'accept_ra_mtu' is introduced such that the user space can control whether or not RA provided MTU updates should be applied. The default behavior is unchanged; user space must explicitly set this flag to 0 for RA MTUs to be ignored. Signed-off-by: Harout Hedeshian Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c694e7baa621..2805062c013f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -52,6 +52,7 @@ struct ipv6_devconf { __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; + __s32 accept_ra_mtu; void *sysctl; }; -- cgit v1.2.3 From 607954b084d4ad5e6a2e0f795de7803d9c6ae37f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 21 Jan 2015 11:12:13 +0000 Subject: rhashtable: fix rht_for_each_entry_safe() endless loop "next" is not updated, causing an endless loop for buckets with more than one element. Fixes: 88d6ed15acff ("rhashtable: Convert bucket iterators to take table and index") Signed-off-by: Patrick McHardy Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a2562ed53ea3..e0337844358e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -260,7 +260,9 @@ void rhashtable_destroy(struct rhashtable *ht); next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next) + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_continue - continue iterating over rcu hash chain -- cgit v1.2.3 From 0b35fa7daefe9da6fdef91d95e07eebb714a8fcc Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:30 +0100 Subject: NFC: st21nfcb: Remove useless include include/linux/platform_data/st21nfcb.h is phy generic. There is no need to include linux/i2c.h Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index c3b432f5b63e..05c54c958cc3 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -19,8 +19,6 @@ #ifndef _ST21NFCB_NCI_H_ #define _ST21NFCB_NCI_H_ -#include - #define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci" struct st21nfcb_nfc_platform_data { -- cgit v1.2.3 From 6da7c85c75eed769423b428eb654eaaf89d273c1 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:31 +0100 Subject: NFC: st21nfcb: Fix copy/paste error in comment include/linux/platform_data/st21nfcb.h is based on include/linux/platform_data/st21nfca.h. The endif comment is inacurrate for st21nfcb. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index 05c54c958cc3..b023373d9874 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -26,4 +26,4 @@ struct st21nfcb_nfc_platform_data { unsigned int irq_polarity; }; -#endif /* _ST21NFCA_HCI_H_ */ +#endif /* _ST21NFCB_NCI_H_ */ -- cgit v1.2.3 From aae88261abd58fffef7ee0e00160ce4ea105b0f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:38 -0800 Subject: net: phy: document has_fixups field has_fixups was introduced to help keeping track of fixups/quirks running on a PHY device, but we did not update the comment above struct phy_device accordingly. Fixes: b0ae009f3dc14 (net: phy: add "has_fixups" boolean property") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c189a1fa3a2..1b3690b597d5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,6 +327,7 @@ struct phy_c45_device_ids { * c45_ids: 802.3-c45 Device Identifers if is_c45. * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. + * has_fixups: Set to true if this phy has fixups/quirks. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY -- cgit v1.2.3 From 8a477a6fb6a33651adda772360b85fd813569743 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:39 -0800 Subject: net: phy: keep track of the PHY suspend state In order to avoid double calls to phydev->drv->suspend and resume, keep track of whether the PHY has already been suspended as a consequence of a successful call to phy_suspend(). We will use this in our MDIO bus suspend/resume hooks to avoid a double suspend call. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 1b3690b597d5..685809835b5c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -328,6 +328,7 @@ struct phy_c45_device_ids { * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. * has_fixups: Set to true if this phy has fixups/quirks. + * suspended: Set to true if this phy has been suspended successfully. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY @@ -365,6 +366,7 @@ struct phy_device { bool is_c45; bool is_internal; bool has_fixups; + bool suspended; enum phy_state state; -- cgit v1.2.3 From cfcf1682c4ca8f601a4702255958e0b1c9aa12cc Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:05 +0200 Subject: cfg80211: Add new GCMP, CCMP-256, BIP-GMAC, BIP-CMAC-256 ciphers This makes cfg80211 aware of the GCMP, GCMP-256, CCMP-256, BIP-GMAC-128, BIP-GMAC-256, and BIP-CMAC-256 cipher suites. These new cipher suites were defined in IEEE Std 802.11ac-2013. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 4f4eea8a6288..dbf417bf25bf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1994,9 +1994,15 @@ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, WLAN_KEY_LEN_WEP104 = 13, WLAN_KEY_LEN_CCMP = 16, + WLAN_KEY_LEN_CCMP_256 = 32, WLAN_KEY_LEN_TKIP = 32, WLAN_KEY_LEN_AES_CMAC = 16, WLAN_KEY_LEN_SMS4 = 32, + WLAN_KEY_LEN_GCMP = 16, + WLAN_KEY_LEN_GCMP_256 = 32, + WLAN_KEY_LEN_BIP_CMAC_256 = 32, + WLAN_KEY_LEN_BIP_GMAC_128 = 16, + WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; #define IEEE80211_WEP_IV_LEN 4 @@ -2004,9 +2010,16 @@ enum ieee80211_key_len { #define IEEE80211_CCMP_HDR_LEN 8 #define IEEE80211_CCMP_MIC_LEN 8 #define IEEE80211_CCMP_PN_LEN 6 +#define IEEE80211_CCMP_256_HDR_LEN 8 +#define IEEE80211_CCMP_256_MIC_LEN 16 +#define IEEE80211_CCMP_256_PN_LEN 6 #define IEEE80211_TKIP_IV_LEN 8 #define IEEE80211_TKIP_ICV_LEN 4 #define IEEE80211_CMAC_PN_LEN 6 +#define IEEE80211_GMAC_PN_LEN 6 +#define IEEE80211_GCMP_HDR_LEN 8 +#define IEEE80211_GCMP_MIC_LEN 16 +#define IEEE80211_GCMP_PN_LEN 6 /* Public action codes */ enum ieee80211_pub_actioncode { @@ -2230,6 +2243,11 @@ enum ieee80211_sa_query_action { #define WLAN_CIPHER_SUITE_WEP104 0x000FAC05 #define WLAN_CIPHER_SUITE_AES_CMAC 0x000FAC06 #define WLAN_CIPHER_SUITE_GCMP 0x000FAC08 +#define WLAN_CIPHER_SUITE_GCMP_256 0x000FAC09 +#define WLAN_CIPHER_SUITE_CCMP_256 0x000FAC0A +#define WLAN_CIPHER_SUITE_BIP_GMAC_128 0x000FAC0B +#define WLAN_CIPHER_SUITE_BIP_GMAC_256 0x000FAC0C +#define WLAN_CIPHER_SUITE_BIP_CMAC_256 0x000FAC0D #define WLAN_CIPHER_SUITE_SMS4 0x00147201 -- cgit v1.2.3 From 56c52da2d554f081e8fce58ecbcf6a40c605b95b Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:08 +0200 Subject: mac80111: Add BIP-CMAC-256 cipher This allows mac80211 to configure BIP-CMAC-256 to the driver and also use software-implementation within mac80211 when the driver does not support this with hardware accelaration. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dbf417bf25bf..b9c7897dc566 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1017,6 +1017,15 @@ struct ieee80211_mmie { u8 mic[8]; } __packed; +/* Management MIC information element (IEEE 802.11w) for GMAC and CMAC-256 */ +struct ieee80211_mmie_16 { + u8 element_id; + u8 length; + __le16 key_id; + u8 sequence_number[6]; + u8 mic[16]; +} __packed; + struct ieee80211_vendor_ie { u8 element_id; u8 len; -- cgit v1.2.3 From 2130fb97fecf9a51bb4a21da220cff3f72496a94 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 27 Jan 2015 01:18:19 +0100 Subject: NFC: st21nfca: Adding support for secure element st21nfca has 1 physical SWP line and can support up to 2 secure elements (UICC & eSE) thanks to an external switch managed with a gpio. The platform integrator needs to specify thanks to 2 initialization properties, uicc-present and ese-present, if it is suppose to have uicc and/or ese. Of course if the platform does not have an external switch, only one kind of secure element can be supported. Those parameters are under platform integrator responsibilities. During initialization, the white_list will be set according to those parameters. The discovery_se function will assume a secure element is physically present according to uicc-present and ese-present values and will add it to the secure element list. On ese activation, the atr is retrieved to calculate a command exchange timeout based on the first atr(TB) value. The se_io will allow to transfer data over SWP. 2 kind of events may appear after a data is sent over: - ST21NFCA_EVT_TRANSMIT_DATA when receiving an apdu answer - ST21NFCA_EVT_WTX_REQUEST when the secure element needs more time than expected to compute a command. If this timeout expired, a first recovery tentative consist to send a simple software reset proprietary command. If this tentative still fail, a second recovery tentative consist to send a hardware reset proprietary command. This function is only relevant for eSE like secure element. This patch also change the way a pipe is referenced. There can be different pipe connected to the same gate with different host destination (ex: CONNECTIVITY). In order to keep host information every pipe are reference with a tuple (gate, host). In order to reduce changes, we are keeping unchanged the way a gate is addressed on the Terminal Host. However, this is working because we consider the apdu reader gate is only present on the eSE slot also the connectivity gate cannot give a reliable value; it will give the latest stored pipe value. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfca.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h index 5087fff96d86..cc2bdafb0c69 100644 --- a/include/linux/platform_data/st21nfca.h +++ b/include/linux/platform_data/st21nfca.h @@ -26,6 +26,8 @@ struct st21nfca_nfc_platform_data { unsigned int gpio_ena; unsigned int irq_polarity; + bool is_ese_present; + bool is_uicc_present; }; #endif /* _ST21NFCA_HCI_H_ */ -- cgit v1.2.3 From be6a6b43b597a37d96dbf74985f72045ccef0940 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:57:59 +0200 Subject: net/mlx4_core: Add bad-cable event support If the firmware can detect a bad cable, allow it to generate an event, and print the problem in the log. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 5ef54e145e4d..c95d659a39f2 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -200,7 +200,8 @@ enum { MLX4_DEV_CAP_FLAG2_CONFIG_DEV = 1LL << 16, MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, - MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19 + MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 }; enum { @@ -280,6 +281,7 @@ enum mlx4_event { MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT = 0x3e, MLX4_EVENT_TYPE_NONE = 0xff, }; @@ -288,6 +290,11 @@ enum { MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 }; +enum { + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE = 1, + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE = 2, +}; + enum { MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, }; @@ -860,6 +867,11 @@ struct mlx4_eqe { } __packed tbl_change_info; } params; } __packed port_mgmt_change; + struct { + u8 reserved[3]; + u8 port; + u32 reserved1[5]; + } __packed bad_cable; } event; u8 slave_id; u8 reserved3[2]; -- cgit v1.2.3 From 5a03108689c6f3e448a920b42af04e6d28401f80 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:58:02 +0200 Subject: net/mlx4_core: Adjust command timeouts to conform to the firmware spec The firmware spec states that the timeout for all commands should be 60 seconds. In the past, the spec indicated that there were several classes of timeout (short, medium, and long). The driver has these different timeout classes. We leave the class differentiation in the driver as-is (to protect against any future spec changes), but set the timeout for all classes to be 60 seconds. In addition, we fix a few commands which had hard-coded numeric timeouts specified. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/mlx4/cmd.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index c989442ffc6a..ae95adc78509 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -165,9 +165,9 @@ enum { }; enum { - MLX4_CMD_TIME_CLASS_A = 10000, - MLX4_CMD_TIME_CLASS_B = 10000, - MLX4_CMD_TIME_CLASS_C = 10000, + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, }; enum { -- cgit v1.2.3 From 3c313161353ad527de1a6ecde0f0d41700858fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sat, 24 Jan 2015 18:47:20 +0100 Subject: bcma: detect SPROM revision 11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracting values from it is still unsupported, but at least we'll display some meaningful error now. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/ssb/ssb_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb_regs.h b/include/linux/ssb/ssb_regs.h index f7b9100686c3..c0f707ac192b 100644 --- a/include/linux/ssb/ssb_regs.h +++ b/include/linux/ssb/ssb_regs.h @@ -173,6 +173,7 @@ #define SSB_SPROMSIZE_BYTES_R123 (SSB_SPROMSIZE_WORDS_R123 * sizeof(u16)) #define SSB_SPROMSIZE_BYTES_R4 (SSB_SPROMSIZE_WORDS_R4 * sizeof(u16)) #define SSB_SPROMSIZE_WORDS_R10 230 +#define SSB_SPROMSIZE_WORDS_R11 234 #define SSB_SPROM_BASE1 0x1000 #define SSB_SPROM_BASE31 0x0800 #define SSB_SPROM_REVISION 0x007E -- cgit v1.2.3 From b504075f5903b969a54ef3a6ae994c0872edb259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sun, 25 Jan 2015 11:11:14 +0100 Subject: bcma: add early_init function for PCIe core and move some fix into it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some PCIe core fixes that need to be applied before accessing SPROM, otherwise reading it may fail. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 0333e605ea0d..3f809ae372c4 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -223,6 +223,7 @@ struct bcma_drv_pci_host { struct bcma_drv_pci { struct bcma_device *core; + u8 early_setup_done:1; u8 setup_done:1; u8 hostmode:1; @@ -237,6 +238,7 @@ struct bcma_drv_pci { #define pcicore_write16(pc, offset, val) bcma_write16((pc)->core, offset, val) #define pcicore_write32(pc, offset, val) bcma_write32((pc)->core, offset, val) +extern void bcma_core_pci_early_init(struct bcma_drv_pci *pc); extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); -- cgit v1.2.3 From 8be08a39d498d5d93ff5149276e34ccb4ec3757f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sun, 25 Jan 2015 13:41:19 +0100 Subject: bcma: implement host code support for PCIe Gen 2 devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is stil incomplete, so we don't add PCI IDs of new devices yet. Purpose of this patch is to allow testing & adjusting rest of the code. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_regs.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index eb1c6a47b67f..994739da827f 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -318,6 +318,7 @@ struct bcma_bus { const struct bcma_host_ops *ops; enum bcma_hosttype hosttype; + bool host_is_pcie2; /* Used for BCMA_HOSTTYPE_PCI only */ union { /* Pointer to the PCI bus (only for BCMA_HOSTTYPE_PCI) */ struct pci_dev *host_pci; diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index e64ae7bf80a1..ebd5c1fcdea4 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -64,6 +64,8 @@ #define BCMA_PCI_GPIO_XTAL 0x40 /* PCI config space GPIO 14 for Xtal powerup */ #define BCMA_PCI_GPIO_PLL 0x80 /* PCI config space GPIO 15 for PLL powerdown */ +#define BCMA_PCIE2_BAR0_WIN2 0x70 + /* SiliconBackplane Address Map. * All regions may not exist on all chips. */ -- cgit v1.2.3 From 7866a621043fbaca3d7389e9b9f69dd1a2e5a855 Mon Sep 17 00:00:00 2001 From: Salam Noureddine Date: Tue, 27 Jan 2015 11:35:48 -0800 Subject: dev: add per net_device packet type chains When many pf_packet listeners are created on a lot of interfaces the current implementation using global packet type lists scales poorly. This patch adds per net_device packet type lists to fix this problem. The patch was originally written by Eric Biederman for linux-2.6.29. Tested on linux-3.16. Signed-off-by: "Eric W. Biederman" Signed-off-by: Salam Noureddine Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 642d426a668f..3d37c6eb1732 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1514,6 +1514,8 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; + struct list_head ptype_all; + struct list_head ptype_specific; struct { struct list_head upper; -- cgit v1.2.3 From aafb3e98b27977148c8c86499684f8f5c3decfbb Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 29 Jan 2015 22:40:11 -0800 Subject: netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads This is a high level feature flag for all switch asic offloads switch drivers set this flag on switch ports. Logical devices like bridge, bonds, vxlans can inherit this flag from their slaves/ports. The patch also adds the flag to NETIF_F_ONE_FOR_ALL, so that it gets propagated to the upperdevices (bridges and bonds). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8e30685affeb..7d59dc6ab789 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -66,6 +66,7 @@ enum { NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ + NETIF_F_HW_SWITCH_OFFLOAD_BIT, /* HW switch offload */ /* * Add your fresh new feature above and remember to update @@ -124,6 +125,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) +#define NETIF_F_HW_SWITCH_OFFLOAD __NETIF_F(HW_SWITCH_OFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ @@ -159,7 +161,9 @@ enum { */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \ + NETIF_F_HW_SWITCH_OFFLOAD) + /* * If one device doesn't support one of these features, then disable it * for all in netdev_increment_features. -- cgit v1.2.3 From add511b38266aa10c1079f9248854e6a415c4dc2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 29 Jan 2015 22:40:12 -0800 Subject: bridge: add flags argument to ndo_bridge_setlink and ndo_bridge_dellink bridge flags are needed inside ndo_bridge_setlink/dellink handlers to avoid another call to parse IFLA_AF_SPEC inside these handlers This is used later in this series Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d37c6eb1732..16251e96e6aa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1154,13 +1154,15 @@ struct net_device_ops { int idx); int (*ndo_bridge_setlink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask); int (*ndo_bridge_dellink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, -- cgit v1.2.3 From 366e41d9774d7010cb63112b6db2fce6dc7809c0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 31 Jan 2015 10:40:13 -0500 Subject: ipv6: pull cork initialization into its own function. Pull IPv6 cork initialization into its own function that can be re-used. IPv6 specific cork data did not have an explicit data structure. This patch creats eone so that just ipv6 cork data can be as arguemts. Also, since IPv6 tries to save the flow label into inet_cork_full tructure, pass the full cork. Adjust ip6_cork_release() to take cork data structures. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/ipv6.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 2805062c013f..4d5169f5d7d1 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -125,6 +125,12 @@ struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; +struct inet6_cork { + struct ipv6_txoptions *opt; + u8 hop_limit; + u8 tclass; +}; + /** * struct ipv6_pinfo - ipv6 private area * @@ -217,11 +223,7 @@ struct ipv6_pinfo { struct ipv6_txoptions *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; - struct { - struct ipv6_txoptions *opt; - u8 hop_limit; - u8 tclass; - } cork; + struct inet6_cork cork; }; /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ -- cgit v1.2.3 From 4c946d9c11d173c2ea6b9081b248f8072e6b46f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 19:52:04 -0500 Subject: vmci: propagate msghdr all way down to __qp_memcpy_to_queue() Switch from passing msg->iov_iter.iov to passing msg itself Signed-off-by: Al Viro --- include/linux/vmw_vmci_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h index 5691f752ce8f..63df3a2a8ce5 100644 --- a/include/linux/vmw_vmci_api.h +++ b/include/linux/vmw_vmci_api.h @@ -74,7 +74,7 @@ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size, int mode); ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, - void *iov, size_t iov_size, int mode); + struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, -- cgit v1.2.3 From af2b040e470b470bfc881981db3c796072853eae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 21:44:24 -0500 Subject: rxrpc: switch rxrpc_send_data() to iov_iter primitives Convert skb_add_data() to iov_iter; allows to get rid of the explicit messing with iovec in its only caller - skb_add_data() will keep advancing ->msg_iter for us, so there's no need to similate that manually. Signed-off-by: Al Viro --- include/linux/skbuff.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..9a8bafee1b67 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2484,19 +2484,18 @@ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) } static inline int skb_add_data(struct sk_buff *skb, - char __user *from, int copy) + struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { - int err = 0; - __wsum csum = csum_and_copy_from_user(from, skb_put(skb, copy), - copy, 0, &err); - if (!err) { + __wsum csum = 0; + if (csum_and_copy_from_iter(skb_put(skb, copy), copy, + &csum, from) == copy) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } - } else if (!copy_from_user(skb_put(skb, copy), from, copy)) + } else if (copy_from_iter(skb_put(skb, copy), copy, from) == copy) return 0; __skb_trim(skb, off); -- cgit v1.2.3 From 21226abb4e9f14d88238964d89b279e461ddc30c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:48:29 -0500 Subject: net: switch memcpy_fromiovec()/memcpy_fromiovecend() users to copy_from_iter() That takes care of the majority of ->sendmsg() instances - most of them via memcpy_to_msg() or assorted getfrag() callbacks. One place where we still keep memcpy_fromiovecend() is tipc - there we potentially read the same data over and over; separate patch, that... Signed-off-by: Al Viro --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9a8bafee1b67..b349c96dc80a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2692,8 +2692,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - /* XXX: stripping const */ - return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len); + return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) -- cgit v1.2.3 From 31a25fae85956e3a9c778141d29e5e803fb0b124 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:53:57 -0500 Subject: net: bury net/core/iovec.c - nothing in there is used anymore Signed-off-by: Al Viro --- include/linux/socket.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 6e49a14365dc..5c19cba34dce 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -318,13 +318,6 @@ struct ucred { /* IPX options */ #define IPX_TYPE 1 -extern int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, - int offset, - unsigned int len, __wsum *csump); -extern unsigned long iov_pages(const struct iovec *iov, int offset, - unsigned long nr_segs); - extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); -- cgit v1.2.3 From aad9a1cec7dcd1d45809b64643fce37061b17788 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 14:49:01 -0500 Subject: vhost: switch vhost get_indirect() to iov_iter, kill memcpy_fromiovec() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- include/linux/uio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 1c5e453f7ea9..af3439f4ebf2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,7 +135,6 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, -- cgit v1.2.3 From ba7438aed924133df54a60e4cd5499d359bcf2a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 15:51:28 -0500 Subject: vhost: don't bother copying iovecs in handle_rx(), kill memcpy_toiovecend() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- include/linux/uio.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index af3439f4ebf2..02bd8a92038a 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -137,7 +137,4 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct io int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); -int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, - int offset, int len); - #endif -- cgit v1.2.3 From 57dd8a0735aabff4862025cf64ad94da3d80e620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 16:03:43 -0500 Subject: vhost: vhost_scsi_handle_vq() should just use copy_from_user() it has just verified that it asks no more than the length of the first segment of iovec. And with that the last user of stuff in lib/iovec.c is gone. RIP. Cc: Michael S. Tsirkin Cc: Nicholas A. Bellinger Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- include/linux/uio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 02bd8a92038a..3e0cb4ea3905 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,6 +135,4 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len); #endif -- cgit v1.2.3 From 2bd82484bb4c5db1d5dc983ac7c409b2782e0154 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2015 23:48:24 -0800 Subject: xps: fix xps for stacked devices A typical qdisc setup is the following : bond0 : bonding device, using HTB hierarchy eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc XPS allows to spread packets on specific tx queues, based on the cpu doing the send. Problem is that dequeues from bond0 qdisc can happen on random cpus, due to the fact that qdisc_run() can dequeue a batch of packets. CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1 CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0 CPUB -> dequeue packet P1 from bond0 enqueue packet on eth1/eth2 CPUC -> dequeue packet P2 from bond0 enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0) get_xps_queue() then might select wrong queue for P1, since current cpu might be different than CPUA. P2 might be sent on the old queue (stored in sk->sk_tx_queue_mapping), if CPUC runs a bit faster (or CPUB spins a bit on qdisc lock) Effect of this bug is TCP reorders, and more generally not optimal TX queue placement. (A victim bulk flow can be migrated to the wrong TX queue for a while) To fix this, we have to record sender cpu number the first time dev_queue_xmit() is called for one tx skb. We can union napi_id (used on receive path) and sender_cpu, granted we clear sender_cpu in skb_scrub_packet() (credit to Willem for this union idea) Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Nandita Dukkipati Cc: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..2748ff639144 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -626,8 +626,11 @@ struct sk_buff { __u32 hash; __be16 vlan_proto; __u16 vlan_tci; -#ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_id; +#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) + union { + unsigned int napi_id; + unsigned int sender_cpu; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; -- cgit v1.2.3 From dcdc8994697faa789669c3fdaca1a8bc27a8f356 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 2 Feb 2015 16:07:34 -0800 Subject: net: add skb functions to process remote checksum offload This patch adds skb_remcsum_process and skb_gro_remcsum_process to perform the appropriate adjustments to the skb when receiving remote checksum offload. Updated vxlan and gue to use these functions. Tested: Ran TCP_RR and TCP_STREAM netperf for VXLAN and GUE, did not see any change in performance. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 +++++++++++++++ include/linux/skbuff.h | 21 +++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16251e96e6aa..1347ac50d2af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2318,6 +2318,21 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + + delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); +} + + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2748ff639144..5405dfe02572 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3099,6 +3099,27 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +/* Update skbuf and packet to reflect the remote checksum offload operation. + * When called, ptr indicates the starting point for skb->csum when + * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete + * here, skb_postpull_rcsum is done so skb->csum start is ptr. + */ +static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + __skb_checksum_complete(skb); + skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); + } + + delta = remcsum_adjust(ptr, skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); +} + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) -- cgit v1.2.3 From 61bd3857ff2c7daf756d49b41e6277bbdaa8f789 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:29 +0200 Subject: net/core: Add event for a change in slave state Add event which provides an indication on a change in the state of a bonding slave. The event handler should cast the pointer to the appropriate type (struct netdev_bonding_info) in order to get the full info about the slave. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/netdevice.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1347ac50d2af..ce784d5018e0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -51,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2056,6 +2057,7 @@ struct pcpu_sw_netstats { #define NETDEV_RESEND_IGMP 0x0016 #define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ #define NETDEV_CHANGEINFODATA 0x0018 +#define NETDEV_BONDING_INFO 0x0019 int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -3494,6 +3496,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); +struct netdev_bonding_info { + ifslave slave; + ifbond master; +}; + +struct netdev_notifier_bonding_info { + struct netdev_notifier_info info; /* must be first */ + struct netdev_bonding_info bonding_info; +}; + +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info); + static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { -- cgit v1.2.3 From 59e14e325066be49b49b6c2503337c69a9ee29fc Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:32 +0200 Subject: net/mlx4_core: Port aggregation low level interface Implement the hardware interface required for port aggregation. 1. Disable RX port check on receive - don't perform a validity check that matches to QP's port and the port where the packet is received. 2. Virtual to physical port remap - configure virtual to physical port mapping. Port remap capability for virtual functions. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/cmd.h | 7 +++++++ include/linux/mlx4/device.h | 10 +++++++++- include/linux/mlx4/qp.h | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index ae95adc78509..7b6d4e9ff603 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -71,6 +71,7 @@ enum { /*master notify fw on finish for slave's flr*/ MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_VIRT_PORT_MAP = 0x5c, MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ @@ -170,6 +171,12 @@ enum { MLX4_CMD_TIME_CLASS_C = 60000, }; +enum { + /* virtual to physical port mapping opcode modifiers */ + MLX4_GET_PORT_VIRT2PHY = 0x0, + MLX4_SET_PORT_VIRT2PHY = 0x1, +}; + enum { MLX4_MAILBOX_SIZE = 4096, MLX4_ACCESS_MEM_ALIGN = 256, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c95d659a39f2..d9afd99dde39 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -201,7 +201,8 @@ enum { MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, - MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, + MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21 }; enum { @@ -253,9 +254,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; +enum { + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP +}; + enum mlx4_event { MLX4_EVENT_TYPE_COMP = 0x00, MLX4_EVENT_TYPE_PATH_MIG = 0x01, @@ -1378,6 +1384,8 @@ int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 467ccdf94c98..2bbc62aa818a 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -96,6 +96,7 @@ enum { MLX4_QP_BIT_RRE = 1 << 15, MLX4_QP_BIT_RWE = 1 << 14, MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_FPP = 1 << 3, MLX4_QP_BIT_RIC = 1 << 4, }; -- cgit v1.2.3 From 53f33ae295a5098f12218da1400f55ad7df7447c Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:33 +0200 Subject: net/mlx4_core: Port aggregation upper layer interface Supply interface functions to bond and unbond ports of a mlx4 internal interfaces. Example for such an interface is the one registered by the mlx4 IB driver under RoCE. There are 1. Functions to go in/out to/from bonded mode 2. Function to remap virtual ports to physical ports The bond_mutex prevents simultaneous access to data that keep status of the device in bonded mode. The upper mlx4 interface marks to the mlx4 core module that they want to be subject for such bonding by setting the MLX4_INTFF_BONDING flag. Interface which goes to/from bonded mode is re-created. The mlx4 Ethernet driver does not set this flag when registering the interface, the IB driver does. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 1 + include/linux/mlx4/driver.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d9afd99dde39..977b0b164431 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -70,6 +70,7 @@ enum { MLX4_FLAG_SLAVE = 1 << 3, MLX4_FLAG_SRIOV = 1 << 4, MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_BONDED = 1 << 7 }; enum { diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 022055c8fb26..9553a73d2049 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -49,6 +49,10 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_SLAVE_SHUTDOWN, }; +enum { + MLX4_INTFF_BONDING = 1 << 0 +}; + struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); @@ -57,11 +61,26 @@ struct mlx4_interface { void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); struct list_head list; enum mlx4_protocol protocol; + int flags; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); +int mlx4_bond(struct mlx4_dev *dev); +int mlx4_unbond(struct mlx4_dev *dev); +static inline int mlx4_is_bonded(struct mlx4_dev *dev) +{ + return !!(dev->flags & MLX4_FLAG_BONDED); +} + +struct mlx4_port_map { + u8 port1; + u8 port2; +}; + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); + void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); static inline u64 mlx4_mac_to_u64(u8 *addr) -- cgit v1.2.3 From f2dba9c6ff0d9a515b4c3f1b037cd65c8b2a868c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Feb 2015 07:33:23 +1100 Subject: rhashtable: Introduce rhashtable_walk_* Some existing rhashtable users get too intimate with it by walking the buckets directly. This prevents us from easily changing the internals of rhashtable. This patch adds the helpers rhashtable_walk_init/exit/start/next/stop which will replace these custom walkers. They are meant to be usable for both procfs seq_file walks as well as walking by a netlink dump. The iterator structure should fit inside a netlink dump cb structure, with at least one element to spare. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e0337844358e..58851275fed9 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,6 +18,7 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H +#include #include #include #include @@ -111,6 +112,7 @@ struct rhashtable_params { * @p: Configuration parameters * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping + * @walkers: List of active walkers * @being_destroyed: True if table is set up for destruction */ struct rhashtable { @@ -121,9 +123,36 @@ struct rhashtable { struct rhashtable_params p; struct work_struct run_work; struct mutex mutex; + struct list_head walkers; bool being_destroyed; }; +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @resize: Resize event occured + */ +struct rhashtable_walker { + struct list_head list; + bool resize; +}; + +/** + * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * @ht: Table to iterate through + * @p: Current pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhashtable_walker *walker; + unsigned int slot; + unsigned int skip; +}; + static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -179,6 +208,12 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, bool (*compare)(void *, void *), void *arg); +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU); +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ -- cgit v1.2.3 From a9b2c06dbef48ed31cff1764c5ce824829106f4f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:39 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_request_sock In the SYN_RECV state, where the TCP connection is represented by tcp_request_sock, we now rate-limit SYNACKs in response to a client's retransmitted SYNs: we do not send a SYNACK in response to client SYN if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a SYNACK in response to a client's retransmitted SYN. This allows the vast majority of legitimate client connections to proceed unimpeded, even for the most aggressive platforms, iOS and MacOS, which actually retransmit SYNs 1-second intervals for several times in a row. They use SYN RTO timeouts following the progression: 1,1,1,1,1,2,4,8,16,32. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 67309ece0772..bcc828d3b9b9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -115,6 +115,7 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ + u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. -- cgit v1.2.3 From f2b2c582e82429270d5818fbabe653f4359d7024 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:40 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_sock Ensure that in state ESTABLISHED, where the connection is represented by a tcp_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers or ACK numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. There is already a similar (although global) rate-limiting mechanism for "challenge ACKs". When deciding whether to send a challence ACK, we first consult the new per-connection rate limit, and then the global rate limit. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcc828d3b9b9..66d85a80a1ec 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -153,6 +153,7 @@ struct tcp_sock { u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 tsoffset; /* timestamp offset */ -- cgit v1.2.3 From 4fb17a6091674f469e8ac85dc770fbf9a9ba7cc8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:41 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_timewait_sock Ensure that in state FIN_WAIT2 or TIME_WAIT, where the connection is represented by a tcp_timewait_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 66d85a80a1ec..1a7adb411647 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -342,6 +342,10 @@ struct tcp_timewait_sock { u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; + + /* The time we sent the last out-of-window ACK: */ + u32 tw_last_oow_ack_time; + long tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; -- cgit v1.2.3 From 096a4cfa5807aa89c78ce12309c0b1c10cf88184 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 6 Feb 2015 18:54:19 +0100 Subject: net: fix a typo in skb_checksum_validate_zero_check Remove trailing underscore. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 111e665455c3..1bb36edb66b9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3072,7 +3072,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ - __skb_checksum_validate_(skb, proto, true, true, check, compute_pseudo) + __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) -- cgit v1.2.3 From 567e4b79731c352a17d73c483959f795d3593e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Feb 2015 12:59:01 -0800 Subject: net: rfs: add hash collision detection Receive Flow Steering is a nice solution but suffers from hash collisions when a mix of connected and unconnected traffic is received on the host, when flow hash table is populated. Also, clearing flow in inet_release() makes RFS not very good for short lived flows, as many packets can follow close(). (FIN , ACK packets, ...) This patch extends the information stored into global hash table to not only include cpu number, but upper part of the hash value. I use a 32bit value, and dynamically split it in two parts. For host with less than 64 possible cpus, this gives 6 bits for the cpu number, and 26 (32-6) bits for the upper part of the hash. Since hash bucket selection use low order bits of the hash, we have a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big enough. If the hash found in flow table does not match, we fallback to RPS (if it is enabled for the rxqueue). This means that a packet for an non connected flow can avoid the IPI through a unrelated/victim CPU. This also means we no longer have to clear the table at socket close time, and this helps short lived flows performance. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce784d5018e0..ab3b7cef4638 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -644,39 +644,39 @@ struct rps_dev_flow_table { /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high order bits + * of flow hash, lower part is cpu number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - unsigned int mask; - u16 ents[0]; + u32 mask; + u32 ents[0]; }; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \ - ((_num) * sizeof(u16))) +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { if (table && hash) { - unsigned int cpu, index = hash & table->mask; + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; /* We only give a hint, preemption can change cpu under us */ - cpu = raw_smp_processor_id(); + val |= raw_smp_processor_id(); - if (table->ents[index] != cpu) - table->ents[index] = cpu; + if (table->ents[index] != val) + table->ents[index] = val; } } -static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) - table->ents[hash & table->mask] = RPS_NO_CPU; -} - -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); -- cgit v1.2.3 From 93c1af6ca94c1e763efba76a127b5c135e3d23a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Feb 2015 20:39:13 -0800 Subject: net:rfs: adjust table size checking Make sure root user does not try something stupid. Also make sure mask field in struct rps_sock_flow_table does not share a cache line with the potentially often dirtied flow table. Signed-off-by: Eric Dumazet Fixes: 567e4b79731c ("net: rfs: add hash collision detection") Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ab3b7cef4638..d115256ed5a2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -653,7 +653,8 @@ struct rps_dev_flow_table { */ struct rps_sock_flow_table { u32 mask; - u32 ents[0]; + + u32 ents[0] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) -- cgit v1.2.3 From 35f05dabf95ac3ebc4c15bafd6833f7a3046e66f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Feb 2015 11:49:34 +0200 Subject: IB/mlx4: Reset flow support for IB kernel ULPs The driver exposes interfaces that directly relate to HW state. Upon fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work-request could hang, thereby introducing dependencies in shutdown order. To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device. Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset. It includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled we return simulated CQEs with IB_WC_WR_FLUSH_ERR return code enabling ULPs to clean up their resources and not wait forever for completions upon receiving remove_one. The above change requires an extra check in the data path to make sure that when device is in error state, the simulated CQEs will be returned and no further WQEs will be posted. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c116cb02475c..e4ebff7e9d02 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -689,6 +689,8 @@ struct mlx4_cq { void (*comp)(struct mlx4_cq *); void *priv; } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; }; struct mlx4_qp { -- cgit v1.2.3