From b2d0fe35471d1a71471f99147ffb5986bd60e744 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 28 Feb 2017 15:02:15 +0300
Subject: net/mlx4: && vs & typo

Bitwise & was obviously intended here.

Fixes: 745d8ae4622c ("net/mlx4: Spoofcheck and zero MAC can't coexist")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index e965e5090d96..a858bcb6220b 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -109,7 +109,7 @@ static inline void mlx4_u64_to_mac(u8 *addr, u64 mac)
 	int i;
 
 	for (i = ETH_ALEN; i > 0; i--) {
-		addr[i - 1] = mac && 0xFF;
+		addr[i - 1] = mac & 0xFF;
 		mac >>= 8;
 	}
 }
-- 
cgit v1.2.3


From 39e6c8208d7b6fb9d2047850fb3327db567b564b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Feb 2017 10:34:50 -0800
Subject: net: solve a NAPI race

While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in BH handler, I was confident it was not
a sender issue, or a drop in the network.

This would happen with a very low probability, but hurting RPC
workloads.

A NAPI driver normally arms the IRQ after the napi_complete_done(),
after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
it.

Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
while IRQ are not disabled, we might have later an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.

thread 1                                 thread 2 (could be on same cpu, or not)

// busy polling or napi_watchdog()
napi_schedule();
...
napi->poll()

device polling:
read 2 packets from ring buffer
                                          Additional 3rd packet is
available.
                                          device hard irq

                                          // does nothing because
NAPI_STATE_SCHED bit is owned by thread 1
                                          napi_schedule();

napi_complete_done(napi, 2);
rearm_irq();

Note that rearm_irq() will not force the device to send an additional
IRQ for the packet it already signaled (3rd packet in my example)

This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED

Then napi_complete_done() properly reschedules the napi to make sure
we do not miss something.

Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep() : No need to set NAPI_STATE_MISSED from this point.

In v3, I added more details in the changelog and clears
NAPI_STATE_MISSED in busy_poll_stop()

In v4, I added the ideas given by Alexander Duyck in v3 review

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f40f0ab3847a..97456b2539e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@ struct napi_struct {
 
 enum {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
+	NAPI_STATE_MISSED,	/* reschedule a napi */
 	NAPI_STATE_DISABLE,	/* Disable pending */
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
 };
 
 enum {
-	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
-	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
-	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
-	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
-	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_SCHED	 = BIT(NAPI_STATE_SCHED),
+	NAPIF_STATE_MISSED	 = BIT(NAPI_STATE_MISSED),
+	NAPIF_STATE_DISABLE	 = BIT(NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = BIT(NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
 	return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
-/**
- *	napi_schedule_prep - check if NAPI can be scheduled
- *	@n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-	return !napi_disable_pending(n) &&
-		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
 
 /**
  *	napi_schedule - schedule NAPI poll
-- 
cgit v1.2.3


From eb1e011a14748a1d9df9a7d7df9a5711721a1bdb Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 15 Feb 2017 09:49:26 +0100
Subject: average: change to declare precision, not factor

Declaring the factor is counter-intuitive, and people are prone
to using small(-ish) values even when that makes no sense.

Change the DECLARE_EWMA() macro to take the fractional precision,
in bits, rather than a factor, and update all users.

While at it, add some more documentation.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/average.h | 61 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/average.h b/include/linux/average.h
index d04aa58280de..7ddaf340d2ac 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -1,45 +1,66 @@
 #ifndef _LINUX_AVERAGE_H
 #define _LINUX_AVERAGE_H
 
-/* Exponentially weighted moving average (EWMA) */
+/*
+ * Exponentially weighted moving average (EWMA)
+ *
+ * This implements a fixed-precision EWMA algorithm, with both the
+ * precision and fall-off coefficient determined at compile-time
+ * and built into the generated helper funtions.
+ *
+ * The first argument to the macro is the name that will be used
+ * for the struct and helper functions.
+ *
+ * The second argument, the precision, expresses how many bits are
+ * used for the fractional part of the fixed-precision values.
+ *
+ * The third argument, the weight reciprocal, determines how the
+ * new values will be weighed vs. the old state, new values will
+ * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note
+ * that this parameter must be a power of two for efficiency.
+ */
 
-#define DECLARE_EWMA(name, _factor, _weight)				\
+#define DECLARE_EWMA(name, _precision, _weight_rcp)			\
 	struct ewma_##name {						\
 		unsigned long internal;					\
 	};								\
 	static inline void ewma_##name##_init(struct ewma_##name *e)	\
 	{								\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		/*							\
+		 * Even if you want to feed it just 0/1 you should have	\
+		 * some bits for the non-fractional part...		\
+		 */							\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
 		e->internal = 0;					\
 	}								\
 	static inline unsigned long					\
 	ewma_##name##_read(struct ewma_##name *e)			\
 	{								\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
-		return e->internal >> ilog2(_factor);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		return e->internal >> (_precision);			\
 	}								\
 	static inline void ewma_##name##_add(struct ewma_##name *e,	\
 					     unsigned long val)		\
 	{								\
 		unsigned long internal = ACCESS_ONCE(e->internal);	\
-		unsigned long weight = ilog2(_weight);			\
-		unsigned long factor = ilog2(_factor);			\
+		unsigned long weight_rcp = ilog2(_weight_rcp);		\
+		unsigned long precision = _precision;			\
 									\
-		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
-		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
-		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
 									\
 		ACCESS_ONCE(e->internal) = internal ?			\
-			(((internal << weight) - internal) +		\
-				(val << factor)) >> weight :		\
-			(val << factor);				\
+			(((internal << weight_rcp) - internal) +	\
+				(val << precision)) >> weight_rcp :	\
+			(val << precision);				\
 	}
 
 #endif /* _LINUX_AVERAGE_H */
-- 
cgit v1.2.3