From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:50 -0800 Subject: net: better skb->sender_cpu and skb->napi_id cohabitation skb->sender_cpu and skb->napi_id share a common storage, and we had various bugs about this. We had to call skb_sender_cpu_clear() in some places to not leave a prior skb->napi_id and fool netdev_pick_tx() As suggested by Alexei, we could split the space so that these errors can not happen. 0 value being reserved as the common (not initialized) value, let's reserve [1 .. NR_CPUS] range for valid sender_cpu, and [NR_CPUS+1 .. ~0U] for valid napi_id. This will allow proper busy polling support over tunnels. Signed-off-by: Eric Dumazet Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4355129fff91..c9c394bf0771 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1082,9 +1082,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) static inline void skb_sender_cpu_clear(struct sk_buff *skb) { -#ifdef CONFIG_XPS - skb->sender_cpu = 0; -#endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET -- cgit v1.2.3 From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:52 -0800 Subject: net: un-inline sk_busy_loop() There is really little gain from inlining this big function. We'll soon make it even bigger in following patches. This means we no longer need to export napi_by_id() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 --------- include/net/busy_poll.h | 45 +-------------------------------------------- 2 files changed, 1 insertion(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..2020a89df12b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -460,15 +460,6 @@ static inline void napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_by_id - lookup a NAPI by napi_id - * @napi_id: hashed napi_id - * - * lookup @napi_id in napi_hash table - * must be called under rcu_read_lock() - */ -struct napi_struct *napi_by_id(unsigned int napi_id); - /** * napi_hash_add - add a NAPI to global hashtable * @napi: napi context diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 1d67fb6b23a0..2fbeb1313c0f 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -72,50 +72,7 @@ static inline bool busy_loop_timeout(unsigned long end_time) return time_after(now, end_time); } -/* when used in sock_poll() nonblock is known at compile time to be true - * so the loop and end_time will be optimized out - */ -static inline bool sk_busy_loop(struct sock *sk, int nonblock) -{ - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; - struct napi_struct *napi; - int rc = false; - - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); - - napi = napi_by_id(sk->sk_napi_id); - if (!napi) - goto out; - - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; - - do { - rc = ops->ndo_busy_poll(napi); - - if (rc == LL_FLUSH_FAILED) - break; /* permanent failure */ - - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); - cpu_relax(); - - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); - - rc = !skb_queue_empty(&sk->sk_receive_queue); -out: - rcu_read_unlock_bh(); - return rc; -} +bool sk_busy_loop(struct sock *sk, int nonblock); /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, -- cgit v1.2.3 From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:00 -0800 Subject: net: add netif_tx_napi_add() netif_tx_napi_add() is a variant of netif_napi_add() It should be used by drivers that use a napi structure to exclusively poll TX. We do not want to add this kind of napi in napi_hash[] in following patches, adding generic busy polling to all NAPI drivers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2020a89df12b..838935d1cdbb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -326,7 +326,8 @@ enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash */ + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ }; enum gro_result { @@ -1938,6 +1939,26 @@ static inline void *netdev_priv(const struct net_device *dev) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); +/** + * netif_tx_napi_add - initialize a napi context + * @dev: network device + * @napi: napi context + * @poll: polling function + * @weight: default weight + * + * This variant of netif_napi_add() should be used from drivers using NAPI + * to exclusively poll a TX queue. + * This will avoid we add it into napi_hash[], thus polluting this hash table. + */ +static inline void netif_tx_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add(dev, napi, poll, weight); +} + /** * netif_napi_del - remove a napi context * @napi: napi context -- cgit v1.2.3 From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:01 -0800 Subject: net: move napi_hash[] into read mostly section We do not often add/delete a napi context. Moving napi_hash[] into read mostly section avoids potential false sharing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/hashtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index 519b6e2d769e..661e5c2a8e2a 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -16,6 +16,10 @@ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } +#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] __read_mostly = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] -- cgit v1.2.3 From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:02 -0800 Subject: net: napi_hash_del() returns a boolean status napi_hash_del() will soon be used from both drivers (if they want) or core networking stack. Callers are responsibles to ensure an RCU grace period is respected before freeing napi structure : napi_hash_del() can signal if this RCU grace period is needed or not. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838935d1cdbb..e5c33b29471b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -474,9 +474,10 @@ void napi_hash_add(struct napi_struct *napi); * @napi: napi context * * Warning: caller must observe rcu grace period - * before freeing memory containing @napi + * before freeing memory containing @napi, if + * this function returns true. */ -void napi_hash_del(struct napi_struct *napi); +bool napi_hash_del(struct napi_struct *napi); /** * napi_disable - prevent NAPI from scheduling -- cgit v1.2.3 From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:03 -0800 Subject: net: provide generic busy polling to all NAPI drivers NAPI drivers no longer need to observe a particular protocol to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y) napi_hash_add() and napi_hash_del() are automatically called from core networking stack, respectively from netif_napi_add() and netif_napi_del() This patch depends on free_netdev() and netif_napi_del() being called from process context, which seems to be the norm. Drivers might still prefer to call napi_hash_del() on their own, since they might combine all the rcu grace periods into a single one, knowing their NAPI structures lifetime, while core networking stack has no idea of a possible combining. Once this patch proves to not bring serious regressions, we will cleanup drivers to either remove napi_hash_del() or provide appropriate rcu grace periods combining. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e5c33b29471b..7d2d1d7aaec7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -466,6 +466,9 @@ static inline void napi_complete(struct napi_struct *n) * @napi: napi context * * generate a new napi_id and store a @napi under it in napi_hash + * Used for busy polling (CONFIG_NET_RX_BUSY_POLL) + * Note: This is normally automatically done from netif_napi_add(), + * so might disappear in a future linux version. */ void napi_hash_add(struct napi_struct *napi); @@ -476,6 +479,10 @@ void napi_hash_add(struct napi_struct *napi); * Warning: caller must observe rcu grace period * before freeing memory containing @napi, if * this function returns true. + * Note: core networking stack automatically calls it + * from netif_napi_del() + * Drivers might want to call this helper to combine all + * the needed rcu grace periods into a single one. */ bool napi_hash_del(struct napi_struct *napi); -- cgit v1.2.3 From f5c4a42a7549cbc29dbac2b5ede05a3bc2bb4522 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 5 Nov 2015 07:09:59 +0100 Subject: Bluetooth: Add hci_skb_* helper wrappers for bt_cb(skb) access For all the HCI driver related variables accesssed via bt_cb(skb), provide helper wrappers. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 42844d7b154a..663e0ef1eaa0 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -316,6 +316,10 @@ struct bt_skb_cb { }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) +#define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type +#define hci_skb_expect(skb) bt_cb((skb))->expect +#define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode + static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) { struct sk_buff *skb; -- cgit v1.2.3 From 5fcc86bd2695d4ccad2d87cb424f4c01a4e544f3 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 5 Nov 2015 09:31:39 +0200 Subject: Bluetooth: Remove redundant setting to zero of bt_cb The socket allocation functions will always memset skb->cb to zero so there's no need to make other initializations needing the same thing. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 663e0ef1eaa0..a85e6d3d75ef 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -325,10 +325,8 @@ static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) struct sk_buff *skb; skb = alloc_skb(len + BT_SKB_RESERVE, how); - if (skb) { + if (skb) skb_reserve(skb, BT_SKB_RESERVE); - bt_cb(skb)->incoming = 0; - } return skb; } @@ -338,10 +336,8 @@ static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, struct sk_buff *skb; skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err); - if (skb) { + if (skb) skb_reserve(skb, BT_SKB_RESERVE); - bt_cb(skb)->incoming = 0; - } if (!skb && *err) return NULL; -- cgit v1.2.3 From 44d271377479c4d4fe7f2d07d188656684773fbd Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 5 Nov 2015 09:31:40 +0200 Subject: Bluetooth: Compress the size of struct hci_ctrl We can reduce the size of the hci_ctrl struct by converting 'bool req_start' to 'u8 req_flags' and making the two function pointers a union (since only one is ever set at a time). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index a85e6d3d75ef..8d38f411009c 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -296,12 +296,17 @@ typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); +#define HCI_REQ_START BIT(0) +#define HCI_REQ_SKB BIT(1) + struct hci_ctrl { __u16 opcode; - bool req_start; + u8 req_flags; u8 req_event; - hci_req_complete_t req_complete; - hci_req_complete_skb_t req_complete_skb; + union { + hci_req_complete_t req_complete; + hci_req_complete_skb_t req_complete_skb; + }; }; struct bt_skb_cb { -- cgit v1.2.3 From dd31506d4aece48943802c2bca3f1f7d2e7266b4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 8 Nov 2015 07:47:12 +0100 Subject: Bluetooth: Add support for sending system notes to monitor channel The monitor channel can be used to send generic system notes as text strings for debugging purposes. This adds the system note monitor code and uses it for including kernel and subsystem version into traces. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 2 ++ include/net/bluetooth/hci_mon.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 8d38f411009c..bfd1590821d6 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,8 @@ #include #include +#define BT_SUBSYS_VERSION "2.21" + #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 #define PF_BLUETOOTH AF_BLUETOOTH diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 2b67567cf28d..c91bb23eb29e 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -43,6 +43,7 @@ struct hci_mon_hdr { #define HCI_MON_CLOSE_INDEX 9 #define HCI_MON_INDEX_INFO 10 #define HCI_MON_VENDOR_DIAG 11 +#define HCI_MON_SYSTEM_NOTE 12 struct hci_mon_new_index { __u8 type; -- cgit v1.2.3 From ac71494934c475e3f51e5e3e64a12f57618d82a4 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Sun, 8 Nov 2015 07:47:13 +0100 Subject: Bluetooth: Add support for controller specific logging To enable controller specific logging, the userspace daemon has to have the ability to log per controller. To facilitate this support, provide a dedicated logging channel. Messages in this channel will be included in the monitor queue and with that also forwarded to monitoring tools along with the actual hardware traces. All messages from the logging channel are timestamped and with that allow an easy correlation between userspace messages and hardware events. This will increase the ability to debug problems faster. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_mon.h | 1 + include/net/bluetooth/hci_sock.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index c91bb23eb29e..587d0131b349 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -44,6 +44,7 @@ struct hci_mon_hdr { #define HCI_MON_INDEX_INFO 10 #define HCI_MON_VENDOR_DIAG 11 #define HCI_MON_SYSTEM_NOTE 12 +#define HCI_MON_USER_LOGGING 13 struct hci_mon_new_index { __u8 type; diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h index 9a46d665c1b5..8e9138acdae1 100644 --- a/include/net/bluetooth/hci_sock.h +++ b/include/net/bluetooth/hci_sock.h @@ -45,6 +45,7 @@ struct sockaddr_hci { #define HCI_CHANNEL_USER 1 #define HCI_CHANNEL_MONITOR 2 #define HCI_CHANNEL_CONTROL 3 +#define HCI_CHANNEL_LOGGING 4 struct hci_filter { unsigned long type_mask; -- cgit v1.2.3 From 030e7f8141a262e32dc064d7cf12377d769d45c2 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 10 Nov 2015 09:44:53 +0200 Subject: Bluetooth: Remove unnecessary call to hci_update_background_scan The hci_conn_params_clear_all() function is only called from hci_unregister_dev() at which point it's completely futile to try to do any LE scanning updates. Simply remove this unnecessary function call. At the same time we can make the function static since it's only accessed from within the same c-file. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1878d0a96333..15e6a2bffc2b 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1036,7 +1036,6 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); -void hci_conn_params_clear_all(struct hci_dev *hdev); void hci_conn_params_clear_disabled(struct hci_dev *hdev); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, -- cgit v1.2.3 From 2e93e53b8f86fb38a9a3c3bd08e539c40b3f8d89 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:17 +0200 Subject: Bluetooth: Run all background scan updates through req_workqueue Instead of firing off a simple async request queue all background scan updates through req_workqueue and use hci_req_sync() there to ensure that no two updates overlap with each other. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 15e6a2bffc2b..c2ca6a58d1e0 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -327,6 +327,8 @@ struct hci_dev { struct work_struct cmd_work; struct work_struct tx_work; + struct work_struct bg_scan_update; + struct sk_buff_head rx_q; struct sk_buff_head raw_q; struct sk_buff_head cmd_q; -- cgit v1.2.3 From 4ebeee2dff9815619be6ff9a845d33716f48468c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:19 +0200 Subject: Bluetooth: Add HCI status return parameter to hci_req_sync() In some cases it may be important to get the exact HCI status rather than the converted HCI-to-errno value. Add an optional return parameter to the hci_req_sync() API to allow for this. Since there are no good HCI translation candidates for cancelation and timeout, use the "unknown" status code for those cases. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0205b80cc90b..cc2216727655 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -452,7 +452,8 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 -#define HCI_ERROR_INVALID_LL_PARAMS 0x1E +#define HCI_ERROR_INVALID_LL_PARAMS 0x1e +#define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c /* Flow control modes */ -- cgit v1.2.3 From 7c1fbed23981faff2840ddc8909e7c78d80ade30 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:11:23 +0200 Subject: Bluetooth: Move LE scan disable/restart behind req_workqueue To avoid any risks of races, place also these LE scan modification work callbacks behind the same work queue as the other LE scan changes. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c2ca6a58d1e0..1f75aebbd8c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -328,6 +328,8 @@ struct hci_dev { struct work_struct tx_work; struct work_struct bg_scan_update; + struct delayed_work le_scan_disable; + struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; @@ -372,9 +374,6 @@ struct hci_dev { DECLARE_BITMAP(dev_flags, __HCI_NUM_FLAGS); - struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; - __s8 adv_tx_power; __u8 adv_data[HCI_MAX_AD_LENGTH]; __u8 adv_data_len; -- cgit v1.2.3 From e68f072b7396574df5324e1cf93e4b0c92460735 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:30:30 +0200 Subject: Bluetooth: Move Start Discovery to req_workqueue Since discovery also deals with LE scanning it makes sense to move it behind the same req_workqueue as other LE scanning changes. This also simplifies the logic since we do many of the actions in a synchronous manner. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1f75aebbd8c4..72ea8a6d7d70 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -327,6 +327,7 @@ struct hci_dev { struct work_struct cmd_work; struct work_struct tx_work; + struct work_struct discov_update; struct work_struct bg_scan_update; struct delayed_work le_scan_disable; struct delayed_work le_scan_restart; @@ -1473,6 +1474,7 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); +void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); -- cgit v1.2.3 From 2154d3f4fb83c812a161c4910948dd876997e111 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 08:30:45 +0200 Subject: Bluetooth: Move Stop Discovery to req_workqueue Since discovery also deals with LE scanning it makes sense to move it behind the same req_workqueue as other LE scanning changes. This also simplifies the logic since we do many of the actions in a synchronous manner. Part of this refactoring is moving hci_req_stop_discovery() to hci_request.c. At the same time the function receives support for properly handling the STOPPING state since that's the state we'll be in when stopping through the req_workqueue. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 72ea8a6d7d70..609f4a03899c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1475,6 +1475,7 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); +void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); -- cgit v1.2.3 From 0ad06aa6a7682319bb1adcc187a1fa8db6b2da2c Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 11 Nov 2015 14:44:57 +0200 Subject: Bluetooth: Fix specifying role for LE connections The hci_connect_le_scan() is (as the name implies) a master/central role API, so it makes no sense in passing a role parameter to it. At the same time this patch also fixes the direct advertising support for LE L2CAP sockets where we now call the more appropriate hci_le_connect() API if slave/peripheral role is desired. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 609f4a03899c..55ce209157b1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -877,7 +877,7 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, - u16 conn_timeout, u8 role); + u16 conn_timeout); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, u8 role); -- cgit v1.2.3 From a8acce6aa584aa731a2bed240bcd8dc955f01414 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 19 Nov 2015 12:53:21 +0100 Subject: ppp: remove PPPOX_ZOMBIE socket state PPPOX_ZOMBIE is never set anymore. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/linux/if_pppox.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index b49cf923becc..ba7a9b0c7c57 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -91,7 +91,6 @@ enum { PPPOX_CONNECTED = 1, /* connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ PPPOX_RELAY = 4, /* forwarding is enabled */ - PPPOX_ZOMBIE = 8, /* dead, but still bound to ppp device */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; -- cgit v1.2.3 From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant() using it cgroup_is_descendant() currently walks up the hierarchy and compares each ancestor to the cgroup in question. While enough for cgroup core usages, this can't be used in hot paths to test cgroup membership. This patch adds cgroup->ancestor_ids[] which records the IDs of all ancestors including self and cgroup->level for the nesting level. This allows testing whether a given cgroup is a descendant of another in three finite steps - testing whether the two belong to the same hierarchy, whether the descendant candidate is at the same or a higher level than the ancestor and comparing the recorded ancestor_id at the matching level. cgroup_is_descendant() is accordingly reimplmented and made inline. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ include/linux/cgroup.h | 18 +++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..504d8591b6d3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -234,6 +234,14 @@ struct cgroup { */ int id; + /* + * The depth this cgroup is at. The root is at depth zero and each + * step down the hierarchy increments the level. This along with + * ancestor_ids[] can determine whether a given cgroup is a + * descendant of another without traversing the hierarchy. + */ + int level; + /* * Each non-empty css_set associated with this cgroup contributes * one to populated_cnt. All children with non-zero popuplated_cnt @@ -289,6 +297,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + + /* ids of the ancestors at each level including self */ + int ancestor_ids[]; }; /* @@ -308,6 +319,9 @@ struct cgroup_root { /* The root cgroup. Root is destroyed on its release. */ struct cgroup cgrp; + /* for cgrp->ancestor_ids[0] */ + int cgrp_ancestor_id_storage; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..b5ee2c4210f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,7 +81,6 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -459,6 +458,23 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +static inline bool cgroup_is_descendant(struct cgroup *cgrp, + struct cgroup *ancestor) +{ + if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) + return false; + return cgrp->ancestor_ids[ancestor->level] == ancestor->id; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { -- cgit v1.2.3 From bd96f76a2454c6b97d70945902e30b4c31510678 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: kernfs: implement kernfs_walk_and_get() Implement kernfs_walk_and_get() which is similar to kernfs_find_and_get() but can walk a path instead of just a name. v2: Use strlcpy() instead of strlen() + memcpy() as suggested by David. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: David Miller --- include/linux/kernfs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5d4e9c4b821d..af51df35d749 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -274,6 +274,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); @@ -350,6 +352,10 @@ static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } +static inline struct kernfs_node * +kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, + const void *ns) +{ return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } @@ -430,6 +436,12 @@ kernfs_find_and_get(struct kernfs_node *kn, const char *name) return kernfs_find_and_get_ns(kn, name, NULL); } +static inline struct kernfs_node * +kernfs_walk_and_get(struct kernfs_node *kn, const char *path) +{ + return kernfs_walk_and_get_ns(kn, path, NULL); +} + static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) -- cgit v1.2.3 From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put() Implement cgroup_get_from_path() using kernfs_walk_and_get() which obtains a default hierarchy cgroup from its path. This will be used to allow cgroup path based matching from outside cgroup proper - e.g. networking and perf. v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path). Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b5ee2c4210f9..4c3ffab81ba7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,6 +81,8 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +struct cgroup *cgroup_get_from_path(const char *path); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -351,6 +353,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for -- cgit v1.2.3 From 40b25fe5dc57a6557b96241b75ae63dce716a487 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Thu, 19 Nov 2015 16:16:43 +0100 Subject: Bluetooth: Add support for Get Advertising Size Information command The Get Advertising Size Information command allows to retrieve size information for advertising data and scan response data fields depending on the selected flags. This is useful if applications want to know the available size ahead of time. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/mgmt.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index b831242d48a4..af17774c9416 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -571,6 +571,19 @@ struct mgmt_rp_remove_advertising { __u8 instance; } __packed; +#define MGMT_OP_GET_ADV_SIZE_INFO 0x0040 +struct mgmt_cp_get_adv_size_info { + __u8 instance; + __le32 flags; +} __packed; +#define MGMT_GET_ADV_SIZE_INFO_SIZE 5 +struct mgmt_rp_get_adv_size_info { + __u8 instance; + __le32 flags; + __u8 max_adv_data_len; + __u8 max_scan_rsp_len; +} __packed; + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit v1.2.3 From b811580d91e9c0945b0a923dcec3e10cce04ac30 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 19 Nov 2015 12:24:22 -0800 Subject: net: IPv6 fib lookup tracepoint Add tracepoint to show fib6 table lookups and result. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 include/trace/events/fib6.h (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h new file mode 100644 index 000000000000..4cf6bac4686d --- /dev/null +++ b/include/trace/events/fib6.h @@ -0,0 +1,76 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fib6 + +#if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FIB6_H + +#include +#include +#include +#include + +TRACE_EVENT(fib6_table_lookup, + + TP_PROTO(const struct net *net, const struct rt6_info *rt, + u32 tb_id, const struct flowi6 *flp), + + TP_ARGS(net, rt, tb_id, flp), + + TP_STRUCT__entry( + __field( u32, tb_id ) + + __field( int, oif ) + __field( int, iif ) + __field( __u8, tos ) + __field( __u8, scope ) + __field( __u8, flags ) + __array( __u8, src, 16 ) + __array( __u8, dst, 16 ) + + __dynamic_array( char, name, IFNAMSIZ ) + __array( __u8, gw, 16 ) + ), + + TP_fast_assign( + struct in6_addr *in6; + + __entry->tb_id = tb_id; + __entry->oif = flp->flowi6_oif; + __entry->iif = flp->flowi6_iif; + __entry->tos = flp->flowi6_tos; + __entry->scope = flp->flowi6_scope; + __entry->flags = flp->flowi6_flags; + + in6 = (struct in6_addr *)__entry->src; + *in6 = flp->saddr; + + in6 = (struct in6_addr *)__entry->dst; + *in6 = flp->daddr; + + if (rt->rt6i_idev) { + __assign_str(name, rt->rt6i_idev->dev->name); + } else { + __assign_str(name, ""); + } + if (rt == net->ipv6.ip6_null_entry) { + struct in6_addr in6_zero = {}; + + in6 = (struct in6_addr *)__entry->gw; + *in6 = in6_zero; + + } else if (rt) { + in6 = (struct in6_addr *)__entry->gw; + *in6 = rt->rt6i_gateway; + } + ), + + TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c", + __entry->tb_id, __entry->oif, __entry->iif, + __entry->src, __entry->dst, __entry->tos, __entry->scope, + __entry->flags, __get_str(name), __entry->gw) +); + +#endif /* _TRACE_FIB6_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From cc30c16344fc3a25153175c7eb9037b2136cd466 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 20 Nov 2015 03:56:23 +0100 Subject: net: dsa: Add support for a switch reset gpio Some boards have a gpio line tied to the switch reset pin. Allow this gpio to be retrieved from the device tree, and take the switch out of reset before performing the probe. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 82a4c6011173..3f23dd9d6a69 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,13 @@ struct dsa_chip_data { * NULL if there is only one switch chip. */ s8 *rtable; + + /* + * A switch may have a GPIO line tied to its reset pin. Parse + * this from the device tree, and use it before performing + * switch soft reset. + */ + struct gpio_desc *reset; }; struct dsa_platform_data { -- cgit v1.2.3 From dad1581944139bb104965340804d1fb4518aab2c Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Thu, 15 Oct 2015 07:55:59 +0200 Subject: netfilter: ebtables: use __u64 from linux/types.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation error: linux/netfilter_bridge/ebtables.h:38:2: error: unknown type name ‘uint64_t’ Signed-off-by: Mikko Rapeli Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index fd2ee501726d..e3cdf9f1a259 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -12,6 +12,8 @@ #ifndef _UAPI__LINUX_BRIDGE_EFF_H #define _UAPI__LINUX_BRIDGE_EFF_H +#include +#include #include #define EBT_TABLE_MAXNAMELEN 32 @@ -33,8 +35,8 @@ struct xt_match; struct xt_target; struct ebt_counter { - uint64_t pcnt; - uint64_t bcnt; + __u64 pcnt; + __u64 bcnt; }; struct ebt_replace { -- cgit v1.2.3 From 1ffad83dffd675cd742286ae82dca7d746cb0da8 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Thu, 15 Oct 2015 07:56:30 +0200 Subject: netfilter: fix include files for compilation Add missing header dependencies and other small changes so that each file compiles alone in userspace. Signed-off-by: Mikko Rapeli Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/ipset/ip_set_bitmap.h | 2 ++ include/uapi/linux/netfilter/ipset/ip_set_hash.h | 2 ++ include/uapi/linux/netfilter/ipset/ip_set_list.h | 2 ++ include/uapi/linux/netfilter/nf_conntrack_tuple_common.h | 3 +++ include/uapi/linux/netfilter/xt_HMARK.h | 1 + include/uapi/linux/netfilter/xt_RATEEST.h | 1 + include/uapi/linux/netfilter/xt_TEE.h | 2 ++ include/uapi/linux/netfilter/xt_TPROXY.h | 1 + include/uapi/linux/netfilter/xt_hashlimit.h | 1 + include/uapi/linux/netfilter/xt_ipvs.h | 1 + include/uapi/linux/netfilter/xt_mac.h | 2 ++ include/uapi/linux/netfilter/xt_osf.h | 2 ++ include/uapi/linux/netfilter/xt_physdev.h | 2 +- include/uapi/linux/netfilter/xt_policy.h | 2 ++ include/uapi/linux/netfilter/xt_rateest.h | 1 + include/uapi/linux/netfilter/xt_recent.h | 1 + include/uapi/linux/netfilter/xt_sctp.h | 12 ++++++------ include/uapi/linux/netfilter_arp/arp_tables.h | 1 + include/uapi/linux/netfilter_bridge.h | 1 + include/uapi/linux/netfilter_bridge/ebt_arp.h | 1 + include/uapi/linux/netfilter_bridge/ebt_arpreply.h | 2 ++ include/uapi/linux/netfilter_bridge/ebt_ip6.h | 1 + include/uapi/linux/netfilter_bridge/ebt_nat.h | 2 ++ include/uapi/linux/netfilter_ipv4/ip_tables.h | 1 + include/uapi/linux/netfilter_ipv6/ip6_tables.h | 1 + include/uapi/linux/netfilter_ipv6/ip6t_rt.h | 2 +- 26 files changed, 42 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h b/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h index 6a2c038d1888..fd5024d26269 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h +++ b/include/uapi/linux/netfilter/ipset/ip_set_bitmap.h @@ -1,6 +1,8 @@ #ifndef _UAPI__IP_SET_BITMAP_H #define _UAPI__IP_SET_BITMAP_H +#include + /* Bitmap type specific error codes */ enum { /* The element is out of the range of the set */ diff --git a/include/uapi/linux/netfilter/ipset/ip_set_hash.h b/include/uapi/linux/netfilter/ipset/ip_set_hash.h index 352eeccdc7f2..82deeb883ac4 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set_hash.h +++ b/include/uapi/linux/netfilter/ipset/ip_set_hash.h @@ -1,6 +1,8 @@ #ifndef _UAPI__IP_SET_HASH_H #define _UAPI__IP_SET_HASH_H +#include + /* Hash type specific error codes */ enum { /* Hash is full */ diff --git a/include/uapi/linux/netfilter/ipset/ip_set_list.h b/include/uapi/linux/netfilter/ipset/ip_set_list.h index a44efaa98213..84d430368266 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set_list.h +++ b/include/uapi/linux/netfilter/ipset/ip_set_list.h @@ -1,6 +1,8 @@ #ifndef _UAPI__IP_SET_LIST_H #define _UAPI__IP_SET_LIST_H +#include + /* List type specific error codes */ enum { /* Set name to be added/deleted/tested does not exist. */ diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h index 2f6bbc5b8125..a9c3834abdd4 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h @@ -1,6 +1,9 @@ #ifndef _NF_CONNTRACK_TUPLE_COMMON_H #define _NF_CONNTRACK_TUPLE_COMMON_H +#include +#include + enum ip_conntrack_dir { IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY, diff --git a/include/uapi/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h index 826fc5807577..3fb48c8d8d78 100644 --- a/include/uapi/linux/netfilter/xt_HMARK.h +++ b/include/uapi/linux/netfilter/xt_HMARK.h @@ -2,6 +2,7 @@ #define XT_HMARK_H_ #include +#include enum { XT_HMARK_SADDR_MASK, diff --git a/include/uapi/linux/netfilter/xt_RATEEST.h b/include/uapi/linux/netfilter/xt_RATEEST.h index 6605e20ad8cf..ec1b57047e03 100644 --- a/include/uapi/linux/netfilter/xt_RATEEST.h +++ b/include/uapi/linux/netfilter/xt_RATEEST.h @@ -2,6 +2,7 @@ #define _XT_RATEEST_TARGET_H #include +#include struct xt_rateest_target_info { char name[IFNAMSIZ]; diff --git a/include/uapi/linux/netfilter/xt_TEE.h b/include/uapi/linux/netfilter/xt_TEE.h index 5c21d5c829af..01092023404b 100644 --- a/include/uapi/linux/netfilter/xt_TEE.h +++ b/include/uapi/linux/netfilter/xt_TEE.h @@ -1,6 +1,8 @@ #ifndef _XT_TEE_TARGET_H #define _XT_TEE_TARGET_H +#include + struct xt_tee_tginfo { union nf_inet_addr gw; char oif[16]; diff --git a/include/uapi/linux/netfilter/xt_TPROXY.h b/include/uapi/linux/netfilter/xt_TPROXY.h index 902043c2073f..8d693eefdc1f 100644 --- a/include/uapi/linux/netfilter/xt_TPROXY.h +++ b/include/uapi/linux/netfilter/xt_TPROXY.h @@ -2,6 +2,7 @@ #define _XT_TPROXY_H #include +#include /* TPROXY target is capable of marking the packet to perform * redirection. We can get rid of that whenever we get support for diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index cbfc43d1af68..6db90372f09c 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -2,6 +2,7 @@ #define _UAPI_XT_HASHLIMIT_H #include +#include /* timings are in milliseconds. */ #define XT_HASHLIMIT_SCALE 10000 diff --git a/include/uapi/linux/netfilter/xt_ipvs.h b/include/uapi/linux/netfilter/xt_ipvs.h index eff34ac18808..e03b9c31a39d 100644 --- a/include/uapi/linux/netfilter/xt_ipvs.h +++ b/include/uapi/linux/netfilter/xt_ipvs.h @@ -2,6 +2,7 @@ #define _XT_IPVS_H #include +#include enum { XT_IPVS_IPVS_PROPERTY = 1 << 0, /* all other options imply this one */ diff --git a/include/uapi/linux/netfilter/xt_mac.h b/include/uapi/linux/netfilter/xt_mac.h index b892cdc67e06..9a19a08a9181 100644 --- a/include/uapi/linux/netfilter/xt_mac.h +++ b/include/uapi/linux/netfilter/xt_mac.h @@ -1,6 +1,8 @@ #ifndef _XT_MAC_H #define _XT_MAC_H +#include + struct xt_mac_info { unsigned char srcaddr[ETH_ALEN]; int invert; diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index 5d66caeba3ee..e6159958b2fb 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -20,6 +20,8 @@ #define _XT_OSF_H #include +#include +#include #define MAXGENRELEN 32 diff --git a/include/uapi/linux/netfilter/xt_physdev.h b/include/uapi/linux/netfilter/xt_physdev.h index db7a2982e9c0..ccdde87da214 100644 --- a/include/uapi/linux/netfilter/xt_physdev.h +++ b/include/uapi/linux/netfilter/xt_physdev.h @@ -2,7 +2,7 @@ #define _UAPI_XT_PHYSDEV_H #include - +#include #define XT_PHYSDEV_OP_IN 0x01 #define XT_PHYSDEV_OP_OUT 0x02 diff --git a/include/uapi/linux/netfilter/xt_policy.h b/include/uapi/linux/netfilter/xt_policy.h index be8ead05c316..d8a9800dce61 100644 --- a/include/uapi/linux/netfilter/xt_policy.h +++ b/include/uapi/linux/netfilter/xt_policy.h @@ -2,6 +2,8 @@ #define _XT_POLICY_H #include +#include +#include #define XT_POLICY_MAX_ELEM 4 diff --git a/include/uapi/linux/netfilter/xt_rateest.h b/include/uapi/linux/netfilter/xt_rateest.h index d40a6196842a..13fe50d4e4b3 100644 --- a/include/uapi/linux/netfilter/xt_rateest.h +++ b/include/uapi/linux/netfilter/xt_rateest.h @@ -2,6 +2,7 @@ #define _XT_RATEEST_MATCH_H #include +#include enum xt_rateest_match_flags { XT_RATEEST_MATCH_INVERT = 1<<0, diff --git a/include/uapi/linux/netfilter/xt_recent.h b/include/uapi/linux/netfilter/xt_recent.h index 6ef36c113e89..955d562031cc 100644 --- a/include/uapi/linux/netfilter/xt_recent.h +++ b/include/uapi/linux/netfilter/xt_recent.h @@ -2,6 +2,7 @@ #define _LINUX_NETFILTER_XT_RECENT_H 1 #include +#include enum { XT_RECENT_CHECK = 1 << 0, diff --git a/include/uapi/linux/netfilter/xt_sctp.h b/include/uapi/linux/netfilter/xt_sctp.h index 29287be696a2..58ffcfb7978e 100644 --- a/include/uapi/linux/netfilter/xt_sctp.h +++ b/include/uapi/linux/netfilter/xt_sctp.h @@ -66,26 +66,26 @@ struct xt_sctp_info { #define SCTP_CHUNKMAP_IS_CLEAR(chunkmap) \ __sctp_chunkmap_is_clear((chunkmap), ARRAY_SIZE(chunkmap)) -static inline bool +static inline _Bool __sctp_chunkmap_is_clear(const __u32 *chunkmap, unsigned int n) { unsigned int i; for (i = 0; i < n; ++i) if (chunkmap[i]) - return false; - return true; + return 0; + return 1; } #define SCTP_CHUNKMAP_IS_ALL_SET(chunkmap) \ __sctp_chunkmap_is_all_set((chunkmap), ARRAY_SIZE(chunkmap)) -static inline bool +static inline _Bool __sctp_chunkmap_is_all_set(const __u32 *chunkmap, unsigned int n) { unsigned int i; for (i = 0; i < n; ++i) if (chunkmap[i] != ~0U) - return false; - return true; + return 0; + return 1; } #endif /* _XT_SCTP_H_ */ diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index a5a86a4db6b3..ece3ad4eecda 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/include/uapi/linux/netfilter_bridge.h b/include/uapi/linux/netfilter_bridge.h index a5eda6db8d79..514519b47651 100644 --- a/include/uapi/linux/netfilter_bridge.h +++ b/include/uapi/linux/netfilter_bridge.h @@ -4,6 +4,7 @@ /* bridge-specific defines for netfilter. */ +#include #include #include #include diff --git a/include/uapi/linux/netfilter_bridge/ebt_arp.h b/include/uapi/linux/netfilter_bridge/ebt_arp.h index 522f3e427f49..dd4df25330e8 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_arp.h +++ b/include/uapi/linux/netfilter_bridge/ebt_arp.h @@ -2,6 +2,7 @@ #define __LINUX_BRIDGE_EBT_ARP_H #include +#include #define EBT_ARP_OPCODE 0x01 #define EBT_ARP_HTYPE 0x02 diff --git a/include/uapi/linux/netfilter_bridge/ebt_arpreply.h b/include/uapi/linux/netfilter_bridge/ebt_arpreply.h index 7e77896e1fbf..6fee3402e307 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_arpreply.h +++ b/include/uapi/linux/netfilter_bridge/ebt_arpreply.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_ARPREPLY_H #define __LINUX_BRIDGE_EBT_ARPREPLY_H +#include + struct ebt_arpreply_info { unsigned char mac[ETH_ALEN]; int target; diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip6.h b/include/uapi/linux/netfilter_bridge/ebt_ip6.h index 42b889682721..a062f0ce95f9 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_ip6.h +++ b/include/uapi/linux/netfilter_bridge/ebt_ip6.h @@ -13,6 +13,7 @@ #define __LINUX_BRIDGE_EBT_IP6_H #include +#include #define EBT_IP6_SOURCE 0x01 #define EBT_IP6_DEST 0x02 diff --git a/include/uapi/linux/netfilter_bridge/ebt_nat.h b/include/uapi/linux/netfilter_bridge/ebt_nat.h index 5e74e3b03bd6..c990d74ee966 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_nat.h +++ b/include/uapi/linux/netfilter_bridge/ebt_nat.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_NAT_H #define __LINUX_BRIDGE_EBT_NAT_H +#include + #define NAT_ARP_BIT (0x00000010) struct ebt_nat_info { unsigned char mac[ETH_ALEN]; diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index f1e6ef256034..d0da53d96d93 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 649c68062dca..d1b22653daf2 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_rt.h b/include/uapi/linux/netfilter_ipv6/ip6t_rt.h index 7605a5ff81cd..558f81e46fb9 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_rt.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_rt.h @@ -2,7 +2,7 @@ #define _IP6T_RT_H #include -/*#include */ +#include #define IP6T_RT_HOPS 16 -- cgit v1.2.3 From f7ccdb96fa31305d480678b1ba81225907dd81ef Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 11 Nov 2015 20:17:37 -0200 Subject: netfilter: nf_ct_sctp: move ip_ct_sctp away from UAPI ip_ct_sctp is an internal structure, embedded by the union nf_conntrack_proto to store sctp-specific information at conntrack entries. It has no business with UAPI. This patch moves it from UAPI to a saner place, together with similar structs for other protocols. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sctp.h | 13 +++++++++++++ include/uapi/linux/netfilter/nf_conntrack_sctp.h | 12 +++--------- 2 files changed, 16 insertions(+), 9 deletions(-) create mode 100644 include/linux/netfilter/nf_conntrack_sctp.h (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h new file mode 100644 index 000000000000..22a16a23cd8a --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -0,0 +1,13 @@ +#ifndef _NF_CONNTRACK_SCTP_H +#define _NF_CONNTRACK_SCTP_H +/* SCTP tracking. */ + +#include + +struct ip_ct_sctp { + enum sctp_conntrack state; + + __be32 vtag[IP_CT_DIR_MAX]; +}; + +#endif /* _NF_CONNTRACK_SCTP_H */ diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index ed4e776e1242..2cbc366c3fb4 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -1,5 +1,5 @@ -#ifndef _NF_CONNTRACK_SCTP_H -#define _NF_CONNTRACK_SCTP_H +#ifndef _UAPI_NF_CONNTRACK_SCTP_H +#define _UAPI_NF_CONNTRACK_SCTP_H /* SCTP tracking. */ #include @@ -18,10 +18,4 @@ enum sctp_conntrack { SCTP_CONNTRACK_MAX }; -struct ip_ct_sctp { - enum sctp_conntrack state; - - __be32 vtag[IP_CT_DIR_MAX]; -}; - -#endif /* _NF_CONNTRACK_SCTP_H */ +#endif /* _UAPI_NF_CONNTRACK_SCTP_H */ -- cgit v1.2.3 From 029f7f3b8701cc7aca8bdb31f0c7edd6a479e357 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Nov 2015 23:32:39 +0100 Subject: netfilter: ipv6: nf_defrag: avoid/free clone operations commit 6aafeef03b9d9ecf ("netfilter: push reasm skb through instead of original frag skbs") changed ipv6 defrag to not use the original skbs anymore. So rather than keeping the original skbs around just to discard them afterwards just use the original skbs directly for the fraglist of the newly assembled skb and remove the extra clone/free operations. The skb that completes the fragment queue is morphed into a the reassembled one instead, just like ipv4 defrag. openvswitch doesn't need any additional skb_morph magic anymore to deal with this situation so just remove that. A followup patch can then also remove the NF_HOOK (re)invocation in the ipv6 netfilter defrag hook. Cc: Joe Stringer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index fb7da5bb76cc..fcd20cf8f5d5 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -6,7 +6,6 @@ void nf_defrag_ipv6_enable(void); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); -void nf_ct_frag6_consume_orig(struct sk_buff *skb); struct inet_frags_ctl; -- cgit v1.2.3 From daaa7d647f81f3f1494d9a9029d611b666d63181 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Nov 2015 23:32:40 +0100 Subject: netfilter: ipv6: avoid nf_iterate recursion The previous patch changed nf_ct_frag6_gather() to morph reassembled skb with the previous one. This means that the return value is always NULL or the skb argument. So change it to an err value. Instead of invoking NF_HOOK recursively with threshold to skip already-called hooks we can now just return NF_ACCEPT to move on to the next hook except for -EINPROGRESS (which means skb has been queued for reassembly), in which case we return NF_STOLEN. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index fcd20cf8f5d5..ddf162f7966f 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -5,7 +5,7 @@ void nf_defrag_ipv6_enable(void); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); -struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); +int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); struct inet_frags_ctl; -- cgit v1.2.3 From 7ef8f65df976369588fa1b6466668b1b6a26eb3c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sat, 21 Nov 2015 15:57:27 +0100 Subject: net: ipmr: fix code and comment style Trivial code and comment style fixes, also removed some extra newlines, spaces and tabs. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 59 +++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index a382d2c04a42..cf943016930f 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -4,15 +4,13 @@ #include #include -/* - * Based on the MROUTING 3.5 defines primarily to keep - * source compatibility with BSD. +/* Based on the MROUTING 3.5 defines primarily to keep + * source compatibility with BSD. * - * See the mrouted code for the original history. - * - * Protocol Independent Multicast (PIM) data structures included - * Carlos Picoto (cap@di.fc.ul.pt) + * See the mrouted code for the original history. * + * Protocol Independent Multicast (PIM) data structures included + * Carlos Picoto (cap@di.fc.ul.pt) */ #define MRT_BASE 200 @@ -34,15 +32,13 @@ #define SIOCGETSGCNT (SIOCPROTOPRIVATE+1) #define SIOCGETRPF (SIOCPROTOPRIVATE+2) -#define MAXVIFS 32 +#define MAXVIFS 32 typedef unsigned long vifbitmap_t; /* User mode code depends on this lot */ typedef unsigned short vifi_t; #define ALL_VIFS ((vifi_t)(-1)) -/* - * Same idea as select - */ - +/* Same idea as select */ + #define VIFM_SET(n,m) ((m)|=(1<<(n))) #define VIFM_CLR(n,m) ((m)&=~(1<<(n))) #define VIFM_ISSET(n,m) ((m)&(1<<(n))) @@ -50,11 +46,9 @@ typedef unsigned short vifi_t; #define VIFM_COPY(mfrom,mto) ((mto)=(mfrom)) #define VIFM_SAME(m1,m2) ((m1)==(m2)) -/* - * Passed by mrouted for an MRT_ADD_VIF - again we use the - * mrouted 3.6 structures for compatibility +/* Passed by mrouted for an MRT_ADD_VIF - again we use the + * mrouted 3.6 structures for compatibility */ - struct vifctl { vifi_t vifc_vifi; /* Index of VIF */ unsigned char vifc_flags; /* VIFF_ flags */ @@ -73,10 +67,7 @@ struct vifctl { #define VIFF_USE_IFINDEX 0x8 /* use vifc_lcl_ifindex instead of vifc_lcl_addr to find an interface */ -/* - * Cache manipulation structures for mrouted and PIMd - */ - +/* Cache manipulation structures for mrouted and PIMd */ struct mfcctl { struct in_addr mfcc_origin; /* Origin of mcast */ struct in_addr mfcc_mcastgrp; /* Group in question */ @@ -88,10 +79,7 @@ struct mfcctl { int mfcc_expire; }; -/* - * Group count retrieval for mrouted - */ - +/* Group count retrieval for mrouted */ struct sioc_sg_req { struct in_addr src; struct in_addr grp; @@ -100,10 +88,7 @@ struct sioc_sg_req { unsigned long wrong_if; }; -/* - * To get vif packet counts - */ - +/* To get vif packet counts */ struct sioc_vif_req { vifi_t vifi; /* Which iface */ unsigned long icount; /* In packets */ @@ -112,11 +97,9 @@ struct sioc_vif_req { unsigned long obytes; /* Out bytes */ }; -/* - * This is the format the mroute daemon expects to see IGMP control - * data. Magically happens to be like an IP packet as per the original +/* This is the format the mroute daemon expects to see IGMP control + * data. Magically happens to be like an IP packet as per the original */ - struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ @@ -126,21 +109,13 @@ struct igmpmsg { struct in_addr im_src,im_dst; }; -/* - * That's all usermode folks - */ - - +/* That's all usermode folks */ #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ -/* - * Pseudo messages used by mrouted - */ - +/* Pseudo messages used by mrouted */ #define IGMPMSG_NOCACHE 1 /* Kern cache fill request to mrouted */ #define IGMPMSG_WRONGVIF 2 /* For PIM assert processing (unused) */ #define IGMPMSG_WHOLEPKT 3 /* For PIM Register processing */ - #endif /* _UAPI__LINUX_MROUTE_H */ -- cgit v1.2.3 From a9ecfbe7fcf2f600718c3f995cbb46043f7a7a5d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Nov 2015 00:03:28 +0100 Subject: netfilter: nf_tables: remove unused struct members Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4bd7508bedc9..101d7d7ec243 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -19,8 +19,6 @@ struct nft_pktinfo { const struct net_device *out; u8 pf; u8 hook; - u8 nhoff; - u8 thoff; u8 tprot; /* for x_tables compatibility */ struct xt_action_param xt; -- cgit v1.2.3 From 7ec3f7b47b8d9ad7ba425726f2c58f9ddce040df Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 24 Nov 2015 10:00:22 +0000 Subject: netfilter: nft_payload: add packet mangling support Add support for mangling packet payload. Checksum for the specified base header is updated automatically if requested, however no updates for any kind of pseudo headers are supported, meaning no stateless NAT is supported. For checksum updates different checksumming methods can be specified. The currently supported methods are NONE for no checksum updates, and INET for internet type checksums. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 9 +++++++++ include/uapi/linux/netfilter/nf_tables.h | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index c6f400cfaac8..4ff5424909aa 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -47,6 +47,15 @@ struct nft_payload { enum nft_registers dreg:8; }; +struct nft_payload_set { + enum nft_payload_bases base:8; + u8 offset; + u8 len; + enum nft_registers sreg:8; + u8 csum_type; + u8 csum_offset; +}; + extern const struct nft_expr_ops nft_payload_fast_ops; int nft_payload_module_init(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index d8c8a7c9d88a..5f3ececf84b3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -597,6 +597,17 @@ enum nft_payload_bases { NFT_PAYLOAD_TRANSPORT_HEADER, }; +/** + * enum nft_payload_csum_types - nf_tables payload expression checksum types + * + * @NFT_PAYLOAD_CSUM_NONE: no checksumming + * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + */ +enum nft_payload_csum_types { + NFT_PAYLOAD_CSUM_NONE, + NFT_PAYLOAD_CSUM_INET, +}; + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * @@ -604,6 +615,9 @@ enum nft_payload_bases { * @NFTA_PAYLOAD_BASE: payload base (NLA_U32: nft_payload_bases) * @NFTA_PAYLOAD_OFFSET: payload offset relative to base (NLA_U32) * @NFTA_PAYLOAD_LEN: payload length (NLA_U32) + * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: nft_registers) + * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32) + * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32) */ enum nft_payload_attributes { NFTA_PAYLOAD_UNSPEC, @@ -611,6 +625,9 @@ enum nft_payload_attributes { NFTA_PAYLOAD_BASE, NFTA_PAYLOAD_OFFSET, NFTA_PAYLOAD_LEN, + NFTA_PAYLOAD_SREG, + NFTA_PAYLOAD_CSUM_TYPE, + NFTA_PAYLOAD_CSUM_OFFSET, __NFTA_PAYLOAD_MAX }; #define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) -- cgit v1.2.3 From 9458ceab49179b7fd2d5192fd9dcf316ca195dc0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 24 Nov 2015 15:30:21 -0800 Subject: net: phy: bcm7xxx: Add entry for Broadcom BCM7435 Add a PHY entry for the Broadcom BCM7435 chips, this is a 40nm generation Ethernet PHY which is analogous to its 7425 and 7429 counter parts. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 59f4a7304419..f0ba9c2ec639 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -26,6 +26,7 @@ #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 +#define PHY_ID_BCM7435 0x600d8750 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3 From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/wait.h | 21 +++++++++++++++++++++ include/net/sock.h | 15 +++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..6aa09a875fbd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -107,6 +107,27 @@ static inline int waitqueue_active(wait_queue_head_t *q) return !list_empty(&q->task_list); } +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return waitqueue_active(wq); +} + extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); diff --git a/include/net/sock.h b/include/net/sock.h index 7f89e4ba18d1..62d35afcb3ac 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1879,12 +1880,12 @@ static inline bool sk_has_allocations(const struct sock *sk) } /** - * wq_has_sleeper - check if there are any waiting processes + * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Returns true if socket_wq has waiting processes * - * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory + * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths: @@ -1910,15 +1911,9 @@ static inline bool sk_has_allocations(const struct sock *sk) * data on the socket. * */ -static inline bool wq_has_sleeper(struct socket_wq *wq) +static inline bool skwq_has_sleeper(struct socket_wq *wq) { - /* We need to be sure we are in sync with the - * add_wait_queue modifications to the wait queue. - * - * This memory barrier is paired in the sock_poll_wait. - */ - smp_mb(); - return wq && waitqueue_active(&wq->wait); + return wq && wq_has_sleeper(&wq->wait); } /** -- cgit v1.2.3 From 06bd6c0370bb88a2256c6763a32bc4e4ade06521 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:45 +0100 Subject: net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum MFC_NOTIFY was introduced in kernel 2.1.68 but afaik it hasn't been used and I couldn't find any users currently so just remove it. Only MFC_STATIC is left, so move it into an enum, add a description and use BIT(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 79aaa9fc1a15..fa66ebc1fed6 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -64,6 +64,13 @@ struct vif_device { #define VIFF_STATIC 0x8000 +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + */ +enum { + MFC_STATIC = BIT(0), +}; + struct mfc_cache { struct list_head list; __be32 mfc_mcastgrp; /* Group the entry belongs to */ @@ -89,9 +96,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_LINES 64 #ifdef __BIG_ENDIAN -- cgit v1.2.3 From 520191bb404c4b7b4cdb70a5480ada974b0c2d60 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:46 +0100 Subject: net: ipmr: adjust mroute.h style and drop extern Remove extra spaces and tabs, adjust function definitions, remove an unnecessary ifdef (already used below, just move code) and drop extern from the functions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index fa66ebc1fed6..7c567a2679ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,38 +9,28 @@ #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { - return (opt >= MRT_BASE) && (opt <= MRT_MAX); + return opt >= MRT_BASE && opt <= MRT_MAX; } -#else -static inline int ip_mroute_opt(int opt) -{ - return 0; -} -#endif -#ifdef CONFIG_IP_MROUTE -extern int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); -extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); -extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); -extern int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); -extern int ip_mr_init(void); +int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); +int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); +int ip_mr_init(void); #else -static inline -int ip_mroute_setsockopt(struct sock *sock, - int optname, char __user *optval, unsigned int optlen) +static inline int ip_mroute_setsockopt(struct sock *sock, int optname, + char __user *optval, unsigned int optlen) { return -ENOPROTOOPT; } -static inline -int ip_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sock, int optname, + char __user *optval, int __user *optlen) { return -ENOPROTOOPT; } -static inline -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { return -ENOIOCTLCMD; } @@ -49,6 +39,11 @@ static inline int ip_mr_init(void) { return 0; } + +static inline int ip_mroute_opt(int opt) +{ + return 0; +} #endif struct vif_device { @@ -96,16 +91,16 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 +#define MFC_LINES 64 #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else #define MFC_HASH(a,b) ((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1)) -#endif +#endif struct rtmsg; -extern int ipmr_get_route(struct net *net, struct sk_buff *skb, - __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait); +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait); #endif -- cgit v1.2.3 From 5ea1f13299d8b8edcb2969eda4c81f8e3264b706 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:47 +0100 Subject: net: ipmr: move struct mr_table and VIF_EXISTS to mroute.h Move the definitions of VIF_EXISTS() and struct mr_table to mroute.h Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7c567a2679ce..bf9b322cb0b0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -59,6 +59,25 @@ struct vif_device { #define VIFF_STATIC 0x8000 +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) +#define MFC_LINES 64 + +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) */ @@ -91,8 +110,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 - #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else -- cgit v1.2.3 From 1973a4ea6ceaa47671227c3077f90508ea30897b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:48 +0100 Subject: net: ipmr: move pimsm_enabled to pim.h and rename Move the inline pimsm_enabled() to pim.h and rename it to ipmr_pimsm_enabled to show it's for the ipv4 ipmr code since pim.h is used by IPv6 too. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/pim.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pim.h b/include/linux/pim.h index 252bf6644c51..e1d756f81348 100644 --- a/include/linux/pim.h +++ b/include/linux/pim.h @@ -13,6 +13,11 @@ #define PIM_NULL_REGISTER cpu_to_be32(0x40000000) +static inline bool ipmr_pimsm_enabled(void) +{ + return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2); +} + /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ struct pimreghdr { -- cgit v1.2.3 From 91420b83baa046ada1a899c97f3b2c52a9045705 Mon Sep 17 00:00:00 2001 From: Sudarsana Kalluru Date: Mon, 30 Nov 2015 12:25:03 +0200 Subject: qed: Add support for changing LED state Physical LEDs are being controlled by the management FW. This adds the qed functionality required to request management FW to change the LED configuration, as well as the necessary APIs for this functionality to later be used by the protocol drivers. Signed-off-by: Sudarsana Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index dc9a1353f971..d4a32e878180 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -25,6 +25,12 @@ #include #include +enum qed_led_mode { + QED_LED_MODE_OFF, + QED_LED_MODE_ON, + QED_LED_MODE_RESTORE +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) @@ -252,6 +258,17 @@ struct qed_common_ops { void (*chain_free)(struct qed_dev *cdev, struct qed_chain *p_chain); + +/** + * @brief set_led - Configure LED mode + * + * @param cdev + * @param mode - LED mode + * + * @return 0 on success, error otherwise. + */ + int (*set_led)(struct qed_dev *cdev, + enum qed_led_mode mode); }; /** -- cgit v1.2.3 From c0eb454034aab783dc602739237a63b30867f5bd Mon Sep 17 00:00:00 2001 From: KY Srinivasan Date: Tue, 1 Dec 2015 16:43:10 -0800 Subject: hv_netvsc: Don't ask for additional head room in the skb The rndis header is 116 bytes big and can be placed in the default head room that will be available in the skb. Since the netvsc packet is less than 48 bytes, we can use the skb control buffer for the netvsc packet. With these changes we don't need to ask for additional head room. Signed-off-by: K. Y. Srinivasan Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d2d1d7aaec7..fcbc5259c630 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -132,7 +132,9 @@ static inline bool dev_xmit_complete(int rc) * used. */ -#if defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) +#if defined(CONFIG_HYPERV_NET) +# define LL_MAX_HEADER 128 +#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else -- cgit v1.2.3 From 7450aaf61f0ae2ee6cc6491138d11df2c25e7609 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Nov 2015 08:57:28 -0800 Subject: tcp: suppress too verbose messages in tcp_send_ack() If tcp_send_ack() can not allocate skb, we properly handle this and setup a timer to try later. Use __GFP_NOWARN to avoid polluting syslog in the case host is under memory pressure, so that pertinent messages are not lost under a flood of useless information. sk_gfp_atomic() can use its gfp_mask argument (all callers currently were using GFP_ATOMIC before this patch) We rename sk_gfp_atomic() to sk_gfp_mask() to clearly express this function now takes into account its second argument (gfp_mask) Note that when tcp_transmit_skb() is called with clone_it set to false, we do not attempt memory allocations, so can pass a 0 gfp_mask, which most compilers can emit faster than a non zero or constant value. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 62d35afcb3ac..9065f8b7e646 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -775,9 +775,9 @@ static inline int sk_memalloc_socks(void) #endif -static inline gfp_t sk_gfp_atomic(const struct sock *sk, gfp_t gfp_mask) +static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { - return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC); + return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) -- cgit v1.2.3 From c981e4213e9d2d4ec79501bd607722ec712742a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:06 +0100 Subject: net: add netif_is_team_master helper Similar to other helpers, caller can use this to find out if device is team master. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcbc5259c630..2b889be65d88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1273,6 +1273,7 @@ struct net_device_ops { * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device + * @IFF_TEAM: device is a team device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1299,6 +1300,7 @@ enum netdev_priv_flags { IFF_NO_QUEUE = 1<<21, IFF_OPENVSWITCH = 1<<22, IFF_L3MDEV_SLAVE = 1<<23, + IFF_TEAM = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1325,6 +1327,7 @@ enum netdev_priv_flags { #define IFF_NO_QUEUE IFF_NO_QUEUE #define IFF_OPENVSWITCH IFF_OPENVSWITCH #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE +#define IFF_TEAM IFF_TEAM /** * struct net_device - The DEVICE structure. @@ -3889,6 +3892,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_team_master(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From f7f019ee6d117de5007d0b10e7960696bbf111eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:07 +0100 Subject: net: add netif_is_team_port helper Similar to other helpers, caller can use this to find out if device is team port. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b889be65d88..b3601f8a9b42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3897,6 +3897,11 @@ static inline bool netif_is_team_master(struct net_device *dev) return dev->priv_flags & IFF_TEAM; } +static inline bool netif_is_team_port(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM_PORT; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 7be61833042e7757745345eedc7b0efee240c189 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:08 +0100 Subject: net: add netif_is_lag_master helper Some code does not mind if the master is bond or team and treats them the same, as generic LAG. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b3601f8a9b42..3ca083efa560 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3902,6 +3902,11 @@ static inline bool netif_is_team_port(struct net_device *dev) return dev->priv_flags & IFF_TEAM_PORT; } +static inline bool netif_is_lag_master(struct net_device *dev) +{ + return netif_is_bond_master(dev) || netif_is_team_master(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From e0ba1414f310c83bf425fe26fa2cd5f1befcd6dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:09 +0100 Subject: net: add netif_is_lag_port helper Some code does not mind if a device is bond slave or team port and treats them the same, as generic LAG ports. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca083efa560..1506be58c59a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3907,6 +3907,11 @@ static inline bool netif_is_lag_master(struct net_device *dev) return netif_is_bond_master(dev) || netif_is_team_master(dev); } +static inline bool netif_is_lag_port(struct net_device *dev) +{ + return netif_is_bond_slave(dev) || netif_is_team_port(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1506be58c59a..939b8f3de810 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3619,10 +3619,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private); + struct net_device *upper_dev, + void *upper_priv); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 939b8f3de810..aea556c64f2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2163,6 +2163,7 @@ struct netdev_notifier_changeupper_info { struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the nofication for link or unlink */ + void *upper_info; /* upper dev info */ }; static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, @@ -3620,7 +3621,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv); + void *upper_priv, void *upper_info); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 764f5e544118508add420724789f46e04dba91eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:12 +0100 Subject: net: add info struct for LAG changeupper This struct will be shared by bonding and team to pass internal information to notifier listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aea556c64f2c..3ab90ea0ed03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2110,6 +2110,19 @@ struct pcpu_sw_netstats { #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL); +enum netdev_lag_tx_type { + NETDEV_LAG_TX_TYPE_UNKNOWN, + NETDEV_LAG_TX_TYPE_RANDOM, + NETDEV_LAG_TX_TYPE_BROADCAST, + NETDEV_LAG_TX_TYPE_ROUNDROBIN, + NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, + NETDEV_LAG_TX_TYPE_HASH, +}; + +struct netdev_lag_upper_info { + enum netdev_lag_tx_type tx_type; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From 8fd728566a354f7bc9cb6e781f185b8c39cf505b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:13 +0100 Subject: team: fill-up LAG changeupper info struct and pass it along Initialize netdev_lag_upper_info structure by TX type according to current team mode and pass it along via netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_team.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index a6aa970758a2..b84e49c3a738 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -164,6 +164,7 @@ struct team_mode { size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; + enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 -- cgit v1.2.3 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When lower device like bonding slave, team/bridge port, etc changes its state, it is useful for others to notice this change. Currently this is implemented specificly for bonding as NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make this more generic to be used for all upper-lower devices. Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ab90ea0ed03..ad69f237aa78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,6 +2158,7 @@ struct netdev_lag_upper_info { #define NETDEV_CHANGEINFODATA 0x0018 #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A +#define NETDEV_CHANGELOWERSTATE 0x001B int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -2179,6 +2180,11 @@ struct netdev_notifier_changeupper_info { void *upper_info; /* upper dev info */ }; +struct netdev_notifier_changelowerstate_info { + struct netdev_notifier_info info; /* must be first */ + void *lower_state_info; /* is lower dev state */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { @@ -3640,6 +3646,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 -- cgit v1.2.3 From fb1b2e3ce53aef80b3cef71f3885d584cdbdc6b8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:16 +0100 Subject: net: introduce lower state changed info structure for LAG lowers This is shared info structure for bonding and team. Serves to pass down info about link state and port activity to notification listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad69f237aa78..fa84b59eb197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2123,6 +2123,11 @@ struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; }; +struct netdev_lag_lower_state_info { + u8 link_up : 1, + tx_enabled : 1; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From 5d397061ca2081d8a99e4bee5792122faa6aaf86 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:19 +0100 Subject: bonding: allow notifications for bond_set_slave_link_state Similar to state notifications. We allow caller to indicate if the notification should happen now or later, depending on if he holds rtnl mutex or not. Introduce bond_slave_link_notify function (similar to bond_slave_state_notify) which is later on called with rtnl mutex and goes over slaves and executes delayed notification. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/bonding.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index c1740a2794a3..1df437715e2f 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -165,7 +165,8 @@ struct slave { u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ - should_notify:1; /* indicateds whether the state changed */ + should_notify:1, /* indicates whether the state changed */ + should_notify_link:1; /* indicates whether the link changed */ u8 duplex; u32 original_mtu; u32 link_failure_count; @@ -504,10 +505,35 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } -static inline void bond_set_slave_link_state(struct slave *slave, int state) +static inline void bond_set_slave_link_state(struct slave *slave, int state, + bool notify) { + if (slave->link == state) + return; + slave->link = state; - bond_queue_slave_event(slave); + if (notify) { + bond_queue_slave_event(slave); + slave->should_notify_link = 0; + } else { + if (slave->should_notify_link) + slave->should_notify_link = 0; + else + slave->should_notify_link = 1; + } +} + +static inline void bond_slave_link_notify(struct bonding *bond) +{ + struct list_head *iter; + struct slave *tmp; + + bond_for_each_slave(bond, tmp, iter) { + if (tmp->should_notify_link) { + bond_queue_slave_event(tmp); + tmp->should_notify_link = 0; + } + } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) -- cgit v1.2.3 From f7c7eb7f7af7f87e0fc150994785fd139576e43a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:20 +0100 Subject: bonding: implement lower state change propagation Let netdev notifier listeners know about link and slave state change. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/bonding.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 1df437715e2f..ee6c52053aa3 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -247,6 +247,7 @@ struct bonding { ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); +void bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; @@ -328,6 +329,7 @@ static inline void bond_set_active_slave(struct slave *slave) if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -337,6 +339,7 @@ static inline void bond_set_backup_slave(struct slave *slave) if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -349,6 +352,7 @@ static inline void bond_set_slave_state(struct slave *slave, slave->backup = slave_state; if (notify) { + bond_lower_state_changed(slave); rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); bond_queue_slave_event(slave); slave->should_notify = 0; @@ -380,6 +384,7 @@ static inline void bond_slave_state_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { + bond_lower_state_changed(tmp); rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_ATOMIC); tmp->should_notify = 0; } @@ -514,6 +519,7 @@ static inline void bond_set_slave_link_state(struct slave *slave, int state, slave->link = state; if (notify) { bond_queue_slave_event(slave); + bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) @@ -531,6 +537,7 @@ static inline void bond_slave_link_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); + bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } -- cgit v1.2.3 From fc50db98ff872372f266695858f87a12eb1b4f05 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 1 Dec 2015 18:03:09 +0200 Subject: net/mlx5_core: Add base sriov support This patch adds SRIOV base support for mlx5 supported devices. The same driver is used for both PFs and VFs; VFs are identified by the driver through the flag MLX5_PCI_DEV_IS_VF added to the pci table entries. Virtual functions are created as usual through writing a value to the sriov_numvs sysfs file of the PF device. Upon instantiating VFs, they will all be probed by the driver on the hypervisor. One can gracefully unbind them through /sys/bus/pci/drivers/mlx5_core/unbind. mlx5_wait_for_vf_pages() was added to ensure that when a VF dies without executing proper teardown, the hypervisor driver waits till all of the pages that were allocated at the hypervisor to maintain its operation are returned. In order for the VF to be operational, the PF needs to call enable_hca for it. This can be done before the VFs are created through a call to pci_enable_sriov. If the there are VFs assigned to a VMs when the driver of the PF is unloaded, all the VF will experience system error and PF driver unloads cleanly; in this case pci_disable_sriov is not called and the devices will show when running lspci. Once the PF driver is reloaded, it will sync its data structures which maintain state on its VFs. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 24 ++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5c857f2a20d7..efebb87163c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -426,6 +426,16 @@ struct mlx5_mr_table { struct radix_tree_root tree; }; +struct mlx5_vf_context { + int enabled; +}; + +struct mlx5_core_sriov { + struct mlx5_vf_context *vfs_ctx; + int num_vfs; + int enabled_vfs; +}; + struct mlx5_irq_info { cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; @@ -447,6 +457,7 @@ struct mlx5_priv { int fw_pages; atomic_t reg_pages; struct list_head free_list; + int vfs_pages; struct mlx5_core_health health; @@ -485,6 +496,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + struct mlx5_core_sriov sriov; + unsigned long pci_dev_data; }; enum mlx5_device_state { @@ -739,6 +752,8 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); @@ -884,6 +899,15 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +enum { + MLX5_PCI_DEV_IS_VF = 1 << 0, +}; + +static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) +{ + return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); +} + static inline int mlx5_get_gid_table_len(u16 param) { if (param > 4) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1565324eb620..9b76fddd696b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18[0x4]; + u8 reserved_18_0; + u8 early_vf_enable; + u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; u8 reserved_19[0x6]; u8 port_type[0x2]; -- cgit v1.2.3 From 54f0a411ec72cb437d57d0c9654dcbd0f198ff3a Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:10 +0200 Subject: net/mlx5: Add HW capabilities and structs for SR-IOV E-Switch Update HCA capabilities and HW struct to include needed capabilities for upcoming Ethernet Switch (SR-IOV E-Switch). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9b76fddd696b..836cf0e43174 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18_0; + u8 eswitch_flow_table[0x1]; u8 early_vf_enable; u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; @@ -789,22 +789,30 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_60[0x1b]; u8 log_max_wq_sz[0x5]; - u8 reserved_61[0xa0]; - + u8 nic_vport_change_event[0x1]; + u8 reserved_61[0xa]; + u8 log_max_vlan_list[0x5]; u8 reserved_62[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_63[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 reserved_64[0x80]; + + u8 reserved_65[0x3]; u8 log_max_l2_table[0x5]; - u8 reserved_63[0x8]; + u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_64[0x100]; + u8 reserved_67[0xe0]; - u8 reserved_65[0x1f]; + u8 reserved_68[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_66[0x220]; + u8 reserved_69[0x220]; }; enum { @@ -2135,10 +2143,6 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_wq_bits wq; }; -enum { - MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS = 0x0, -}; - struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; -- cgit v1.2.3 From e1d7d349c69d12721c420f1fe673ce9aa462aadd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:11 +0200 Subject: net/mlx5: Update access functions to Query/Modify vport MAC address In preparation for SR-IOV we add here an API to enable each e-switch client (PF/VF) to configure its L2 MAC addresses and for the e-switch manager (usually the PF) to access them in order to be able to configure them into the e-switch. Therefore we now pass vport num parameter to mlx5_query_nic_vport_context, so PF can access other vports contexts. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 967e0fd06e89..43e82d9f5463 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,10 @@ #include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); -void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr); +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, + u16 vport, u8 *addr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); -- cgit v1.2.3 From e16aea2744abea612c27ee0eef606c6a6a8204de Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:12 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport mac lists Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) UC/MC mac lists changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/vport.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b473cbfa7ef..0d2f0435a9f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1102,6 +1102,12 @@ enum { MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, }; +enum mlx5_list_type { + MLX5_NVPRT_LIST_TYPE_UC = 0x0, + MLX5_NVPRT_LIST_TYPE_MC = 0x1, + MLX5_NVPRT_LIST_TYPE_VLAN = 0x2, +}; + enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 43e82d9f5463..00bbec8d9527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -34,6 +34,7 @@ #define __MLX5_VPORT_H__ #include +#include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, @@ -54,5 +55,14 @@ int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, u64 *sys_image_guid); int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size); +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From e75465148b7df7f2796c75bf98bf33f171edeb2b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:13 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport state In preparation for SR-IOV we add here an API to enable each e-switch manager (PF) to configure its VFs link states in e-switch preparation for ethernet sriov. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/vport.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 836cf0e43174..655184702ea2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2946,6 +2946,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 00bbec8d9527..c1bba5948851 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,11 @@ #include #include -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport); +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, -- cgit v1.2.3 From d82b73186dab70d6d332dd2afdb48608be2e5230 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:14 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport promisc mode Those functions are needed to notify the upcoming SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) promisc mode changes. Preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++----- include/linux/mlx5/vport.h | 9 +++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 655184702ea2..2728b5f6c017 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2147,16 +2147,31 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; - u8 reserved_1[0x760]; + u8 arm_change_event[0x1]; + u8 reserved_1[0x1a]; + u8 event_on_mtu[0x1]; + u8 event_on_promisc_change[0x1]; + u8 event_on_vlan_change[0x1]; + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; - u8 reserved_2[0x5]; + u8 reserved_2[0xf0]; + + u8 mtu[0x10]; + + u8 reserved_3[0x640]; + + u8 promisc_uc[0x1]; + u8 promisc_mc[0x1]; + u8 promisc_all[0x1]; + u8 reserved_4[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_3[0xc]; + u8 reserved_5[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_4[0x20]; + u8 reserved_6[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4235,7 +4250,10 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_0[0x1c]; + u8 reserved_0[0x19]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; u8 permanent_address[0x1]; u8 addresses_list[0x1]; u8 roce_en[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c1bba5948851..dbbaed9f975a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -68,5 +68,14 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, enum mlx5_list_type list_type, u8 addr_list[][ETH_ALEN], int list_size); +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all); +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From c0046cf7b81ac55b8bf056c71918ec04edd99379 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:15 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport vlans Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) vlan table changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ include/linux/mlx5/vport.h | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2728b5f6c017..39487d0c305d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -910,6 +910,13 @@ struct mlx5_ifc_mac_address_layout_bits { u8 mac_addr_31_0[0x20]; }; +struct mlx5_ifc_vlan_layout_bits { + u8 reserved_0[0x14]; + u8 vlan[0x0c]; + + u8 reserved_1[0x20]; +}; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_0[0xa0]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index dbbaed9f975a..638f2ca7a527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -77,5 +77,12 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, int promisc_uc, int promisc_mc, int promisc_all); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size); +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 073bb189a41d7bbad509b576a690611c46c4858f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:18 +0200 Subject: net/mlx5: Introducing E-Switch and l2 table E-Switch is the software entity that represents and manages ConnectX4 inter-HCA ethernet l2 switching. E-Switch has its own Virtual Ports, each Vport/vNIC/VF can be connected to the device through a vport of an e-switch. Each e-switch is managed by one vNIC identified by HCA_CAP.vport_group_manager (usually it is the PF/vport[0]), and its main responsibility is to forward each packet to the right vport. e-Switch needs to manage its own l2-table and FDB tables. L2 table is a flow table that is managed by FW, it is needed for Multi-host (Multi PF) configuration for inter HCA switching between PFs. FDB table is a flow table that is totally managed by e-Switch driver, its main responsibility is to switch packets between e-Swtich internal vports and uplink vport that belong to the same. This patch introduces only e-Swtich l2 table management, FDB managemnt will come later when ethernet SRIOV/VFs will be enabled. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 8 ++++++++ include/linux/mlx5/driver.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0d2f0435a9f0..90a4cb6dc4cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -251,6 +251,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, + MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, }; enum { @@ -520,6 +521,12 @@ struct mlx5_eqe_page_fault { __be32 flags_qpn; } __packed; +struct mlx5_eqe_vport_change { + u8 rsvd0[2]; + __be16 vport_num; + __be32 rsvd1[6]; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -532,6 +539,7 @@ union ev_data { struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; struct mlx5_eqe_page_fault page_fault; + struct mlx5_eqe_vport_change vport_change; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index efebb87163c8..ac098b6b97bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -441,6 +441,8 @@ struct mlx5_irq_info { char name[MLX5_MAX_IRQ_NAME]; }; +struct mlx5_eswitch; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; @@ -496,6 +498,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + + struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; }; -- cgit v1.2.3 From 495716b191f607b2cb2175f7499966daef79f663 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:19 +0200 Subject: net/mlx5: E-Switch, Introduce FDB hardware capabilities Define needed hardware structures and capabilities needed for E-Switch FDB flow tables and read them on driver load. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 13 +++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 90a4cb6dc4cb..bce9caed1eed 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1138,6 +1138,7 @@ enum mlx5_cap_type { MLX5_CAP_IPOIB_OFFLOADS, MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1175,6 +1176,20 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) +#define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 39487d0c305d..ae7c08adba4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -447,6 +447,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 reserved_3[0x7200]; }; +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_1[0x7800]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1846,6 +1858,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_roce_cap_bits roce_cap; struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; u8 reserved_0[0x8000]; }; -- cgit v1.2.3 From 81848731ff4070a3e4136efa6a99d507177a53fe Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:20 +0200 Subject: net/mlx5: E-Switch, Add SR-IOV (FDB) support Enabling E-Switch SRIOV for nvfs+1 vports. Create E-Switch FDB for L2 UC/MC mac steering between VFs/PF and external vport (Uplink). FDB contains forwarding rules such as: UC MAC0 -> vport0(PF). UC MAC1 -> vport1. UC MAC2 -> vport2. MC MACX -> vport0, vport2, Uplink. MC MACY -> vport1, Uplink. For unmatched traffic FDB has the following default rules: Unmached Traffic (src vport != Uplink) -> Uplink. Unmached Traffic (src vport == Uplink) -> vport0(PF). FDB rules population: Each NIC vport (VF) will notify E-Switch manager of its UC/MC vport context changes via modify vport context command, which will be translated to an event that will be handled by E-Switch manager (PF) which will update FDB table accordingly. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/flow_table.h | 9 +++++++++ include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bce9caed1eed..88eb4490a8b3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1074,6 +1074,12 @@ enum { VPORT_STATE_UP = 0x1, }; +enum { + MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, +}; + enum { MLX5_L3_PROT_TYPE_IPV4 = 0, MLX5_L3_PROT_TYPE_IPV6 = 1, diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h index 5f922c6d4fc2..0f2a15cf3317 100644 --- a/include/linux/mlx5/flow_table.h +++ b/include/linux/mlx5/flow_table.h @@ -41,6 +41,15 @@ struct mlx5_flow_table_group { u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; }; +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + void *ft; + u32 vport_num; + }; +}; + void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, u16 num_groups, struct mlx5_flow_table_group *group); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ae7c08adba4a..a81b008738fd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -827,9 +827,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_69[0x220]; }; -enum { - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_ = 0x1, - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR = 0x2, +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, }; struct mlx5_ifc_dest_format_struct_bits { -- cgit v1.2.3 From d6666753c6e85834f1669c7b831cc2b7fc9e4390 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:22 +0200 Subject: net/mlx5: E-Switch, Introduce HCA cap and E-Switch vport context E-Switch vport context is unlike NIC vport context, managed by the E-Switch manager or vport_group_manager and not by the NIC(VF) driver. The E-Switch manager can access (read/modify) any of its vports E-Switch context. Currently E-Switch vport context includes only clietnt and server vlan insertion and striping data (for later support of VST mode). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 9 +++++ include/linux/mlx5/mlx5_ifc.h | 90 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 88eb4490a8b3..7d3a85faefb7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1145,6 +1145,7 @@ enum mlx5_cap_type { MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1196,6 +1197,14 @@ enum mlx5_cap_type { #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) +#define MLX5_CAP_ESW(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ESW_MAX(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH], cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a81b008738fd..f5d94495758a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -459,6 +459,17 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_1[0x7800]; }; +struct mlx5_ifc_e_switch_cap_bits { + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert_if_not_exist[0x1]; + u8 vport_cvlan_insert_overwrite[0x1]; + u8 reserved_0[0x1b]; + + u8 reserved_1[0x7e0]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1860,6 +1871,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; u8 reserved_0[0x8000]; }; @@ -2305,6 +2317,26 @@ struct mlx5_ifc_hca_vport_context_bits { u8 reserved_6[0xca0]; }; +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_1[0x18]; + + u8 reserved_2[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_3[0x7a0]; +}; + enum { MLX5_EQC_STATUS_OK = 0x0, MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, @@ -3743,6 +3775,64 @@ struct mlx5_ifc_query_flow_group_in_bits { u8 reserved_5[0x120]; }; +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_modify_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_esw_vport_context_fields_select_bits { + u8 reserved[0x1c]; + u8 vport_cvlan_insert[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_strip[0x1]; +}; + +struct mlx5_ifc_modify_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_esw_vport_context_fields_select_bits field_select; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + struct mlx5_ifc_query_eq_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; -- cgit v1.2.3 From 2d1e0254ef8310e4f0756130a7ffc007ad1d58df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 1 Dec 2015 14:55:21 +0000 Subject: pci_ids: add Netronome Systems vendor Add PCI vendor id for Netronome Systems. Signed-off-by: Jakub Kicinski Signed-off-by: Rolf Neugebauer Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..1acbefc4bbda 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,8 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b -- cgit v1.2.3 From 357ab2234d57f6c74386f64ded42dff8e3c0500b Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:43:59 +0800 Subject: VSOCK: Introduce vsock_find_unbound_socket and vsock_bind_dgram_generic Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e9eb2d6791b3..a0c8fa2ababf 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -175,8 +175,10 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); +struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ -- cgit v1.2.3 From 80a19e338d458abb5a700df3fd00795c51361f06 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:44:00 +0800 Subject: VSOCK: Introduce virtio-vsock-common.ko This module contains the common code and header files for the following virtio-vsock and virtio-vhost kernel modules. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_ids.h | 1 + include/uapi/linux/virtio_vsock.h | 89 ++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 include/linux/virtio_vsock.h create mode 100644 include/uapi/linux/virtio_vsock.h (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..a5f3ecc038f7 --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,209 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) +#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) + +struct vsock_transport_recv_notify_data; +struct vsock_transport_send_notify_data; +struct sockaddr_vm; +struct vsock_sock; + +enum { + VSOCK_VQ_CTRL = 0, + VSOCK_VQ_RX = 1, /* for host to guest data */ + VSOCK_VQ_TX = 2, /* for guest to host data */ + VSOCK_VQ_MAX = 3, +}; + +/* virtio transport socket state */ +struct virtio_transport { + struct virtio_transport_pkt_ops *ops; + struct vsock_sock *vsk; + + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + struct mutex tx_lock; + struct mutex rx_lock; + + struct list_head rx_queue; + u32 rx_bytes; + + /* Protected by trans->tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + /* Protected by trans->rx_lock */ + u32 fwd_cnt; + + /* Protected by sk_lock */ + u16 dgram_id; + struct list_head incomplete_dgrams; /* dgram fragments */ +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct virtio_transport *trans; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + u16 dgram_id; + u16 dgram_len; +}; + +struct virtio_transport_pkt_ops { + int (*send_pkt)(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info); +}; + +void virtio_vsock_dumppkt(const char *func, + const struct virtio_vsock_pkt *pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt); +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port); +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); +#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 77925f587b15..16dcf5d06cd7 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,6 +39,7 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h new file mode 100644 index 000000000000..8cf9b5682628 --- /dev/null +++ b/include/uapi/linux/virtio_vsock.h @@ -0,0 +1,89 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H +#define _UAPI_LINUX_VIRTIO_VOSCK_H + +#include +#include +#include + +struct virtio_vsock_config { + __le32 guest_cid; + __le32 max_virtqueue_pairs; +}; + +struct virtio_vsock_hdr { + __le32 src_cid; + __le32 src_port; + __le32 dst_cid; + __le32 dst_port; + __le32 len; + __le16 type; /* enum virtio_vsock_type */ + __le16 op; /* enum virtio_vsock_op */ + __le32 flags; + __le32 buf_alloc; + __le32 fwd_cnt; +}; + +enum virtio_vsock_type { + VIRTIO_VSOCK_TYPE_STREAM = 1, + VIRTIO_VSOCK_TYPE_DGRAM = 2, +}; + +enum virtio_vsock_op { + VIRTIO_VSOCK_OP_INVALID = 0, + + /* Connect operations */ + VIRTIO_VSOCK_OP_REQUEST = 1, + VIRTIO_VSOCK_OP_RESPONSE = 2, + VIRTIO_VSOCK_OP_ACK = 3, + VIRTIO_VSOCK_OP_RST = 4, + VIRTIO_VSOCK_OP_SHUTDOWN = 5, + + /* To send payload */ + VIRTIO_VSOCK_OP_RW = 6, + + /* Tell the peer our credit info */ + VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, + /* Request the peer to send the credit info to us */ + VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, +}; + +/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ +enum virtio_vsock_shutdown { + VIRTIO_VSOCK_SHUTDOWN_RCV = 1, + VIRTIO_VSOCK_SHUTDOWN_SEND = 2, +}; + +#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 3110489117581a980537b6d999a3724214ba772c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Oct 2015 17:35:19 +0200 Subject: mac80211: allow driver to prevent two stations w/ same address Some devices or drivers cannot deal with having the same station address for different virtual interfaces, say as a client to two virtual AP interfaces. Rather than requiring each driver with a limitation like that to enforce it, add a hardware flag for it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 760bc4d5a2cf..8628118214cc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1901,6 +1901,11 @@ struct ieee80211_txq { * @IEEE80211_HW_BEACON_TX_STATUS: The device/driver provides TX status * for sent beacons. * + * @IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR: Hardware (or driver) requires that each + * station has a unique address, i.e. each station entry can be identified + * by just its MAC address; this prevents, for example, the same station + * from connecting to two virtual AP interfaces at the same time. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1936,6 +1941,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TDLS_WIDER_BW, IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, + IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 0483eeac59876ac37d4edbabd48727a468416d5b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 23 Oct 2015 09:50:03 +0200 Subject: cfg80211: replace ieee80211_ie_split() with an inline The function is a very simple wrapper around another one, just adds a few default parameters, so replace it with a static inline instead of using EXPORT_SYMBOL, reducing the module size slightly. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2c7bdb81d30c..e568872203a5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5173,8 +5173,11 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * buffer starts, which may be @ielen if the entire (remainder) * of the buffer should be used. */ -size_t ieee80211_ie_split(const u8 *ies, size_t ielen, - const u8 *ids, int n_ids, size_t offset); +static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen, + const u8 *ids, int n_ids, size_t offset) +{ + return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset); +} /** * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN -- cgit v1.2.3 From cf595922b9a3b61e308224fd29909f54ecb557d4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Mon, 9 Nov 2015 12:17:37 +0200 Subject: nl80211: clarify NL80211_ATTR_SCHED_SCAN_DELAY usage with net-detect In this attribute's documentation, it was not clear whether the delay started counting when WoWLAN net-detect was enabled or when the system was suspended. The correct answer is that it starts when the system suspends (which is when, in practice, the scan is scheduled). Clarify that in the nl80211.h documentation. Suggested-by: Samuel Tan Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1f0b4cf5dd03..07099cb14778 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1764,8 +1764,9 @@ enum nl80211_commands { * over all channels. * * @NL80211_ATTR_SCHED_SCAN_DELAY: delay before the first cycle of a - * scheduled scan (or a WoWLAN net-detect scan) is started, u32 - * in seconds. + * scheduled scan is started. Or the delay before a WoWLAN + * net-detect scan is started, counting from the moment the + * system is suspended. This value is a u32, in seconds. * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device * is operating in an indoor environment. -- cgit v1.2.3 From 0ead2510f8cec11ce96308d79a1b4ee272fb5238 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 17 Nov 2015 10:24:36 +0200 Subject: mac80211: allow the driver to send EOSP when needed This can happen when the driver needs to send less frames than expected and then needs to close the SP. Mac80211 still needs to set the more_data properly based on its buffer state (ps_tx_buffer and buffered frames on other TIDs). To that end, refactor the code that delivers frames upon uAPSD trigger frames to be able to get only the more_data bit without actually delivering those frames in case the driver is just asking to set a NDP with EOSP and MORE_DATA bit properly set. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8628118214cc..18ac733afc91 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4868,6 +4868,28 @@ void ieee80211_sta_block_awake(struct ieee80211_hw *hw, */ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta); +/** + * ieee80211_send_eosp_nullfunc - ask mac80211 to send NDP with EOSP + * @pubsta: the station + * @tid: the tid of the NDP + * + * Sometimes the device understands that it needs to close + * the Service Period unexpectedly. This can happen when + * sending frames that are filling holes in the BA window. + * In this case, the device can ask mac80211 to send a + * Nullfunc frame with EOSP set. When that happens, the + * driver must have called ieee80211_sta_set_buffered() to + * let mac80211 know that there are no buffered frames any + * more, otherwise mac80211 will get the more_data bit wrong. + * The low level driver must have made sure that the frame + * will be sent despite the station being in power-save. + * Mac80211 won't call allow_buffered_frames(). + * Note that calling this function, doesn't exempt the driver + * from closing the EOSP properly, it will still have to call + * ieee80211_sta_eosp when the NDP is sent. + */ +void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); + /** * ieee80211_iter_keys - iterate keys programmed into the device * @hw: pointer obtained from ieee80211_alloc_hw() -- cgit v1.2.3 From ef044763a3ca6b9e0bb65a9ce0cb38c0eca62756 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 17 Nov 2015 10:24:37 +0200 Subject: mac80211: add atomic uploaded keys iterator add ieee80211_iter_keys_rcu() to iterate over uploaded keys in atomic context (when rcu is locked) The station removal code removes the keys only after calling synchronize_net(), so it's not safe to iterate the keys at this point (and postponing the actual key deletion with call_rcu() might result in some badly-ordered ops calls). Add a flag to indicate a station is being removed, and skip the configured keys if it's set. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 18ac733afc91..a68051c41ac3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4917,6 +4917,30 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw, void *data), void *iter_data); +/** + * ieee80211_iter_keys_rcu - iterate keys programmed into the device + * @hw: pointer obtained from ieee80211_alloc_hw() + * @vif: virtual interface to iterate, may be %NULL for all + * @iter: iterator function that will be called for each key + * @iter_data: custom data to pass to the iterator function + * + * This function can be used to iterate all the keys known to + * mac80211, even those that weren't previously programmed into + * the device. Note that due to locking reasons, keys of station + * in removal process will be skipped. + * + * This function requires being called in an RCU critical section, + * and thus iter must be atomic. + */ +void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_key_conf *key, + void *data), + void *iter_data); + /** * ieee80211_iter_chan_contexts_atomic - iterate channel contexts * @hw: pointre obtained from ieee80211_alloc_hw(). -- cgit v1.2.3 From b115b972997428b9134aba377721fea6486adbd0 Mon Sep 17 00:00:00 2001 From: "Janusz.Dziedzic@tieto.com" Date: Tue, 27 Oct 2015 08:38:40 +0100 Subject: mac80211: add new IEEE80211_VIF_GET_NOA_UPDATE flag Add new VIF flag, that will allow get NOA update notification when driver will request this, even this is not pure P2P vif (eg. STA vif). Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a68051c41ac3..7c30faff245f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1321,11 +1321,15 @@ struct ieee80211_channel_switch { * interface. This flag should be set during interface addition, * but may be set/cleared as late as authentication to an AP. It is * only valid for managed/station mode interfaces. + * @IEEE80211_VIF_GET_NOA_UPDATE: request to handle NOA attributes + * and send P2P_PS notification to the driver if NOA changed, even + * this is not pure P2P vif. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), IEEE80211_VIF_SUPPORTS_CQM_RSSI = BIT(1), IEEE80211_VIF_SUPPORTS_UAPSD = BIT(2), + IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), }; /** -- cgit v1.2.3 From 91d3ab46730379e89e1e908c6f62fbcadb3d8f08 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Fri, 30 Oct 2015 19:14:49 +0530 Subject: cfg80211: Add support for aborting an ongoing scan Implement new functionality for aborting an ongoing scan. Add NL80211_CMD_ABORT_SCAN to the nl80211 interface. After aborting the scan, driver shall provide the scan status by calling cfg80211_scan_done(). Reviewed-by: Jouni Malinen Signed-off-by: Vidyullatha Kanchanapally Signed-off-by: Sunil Dutt [change command to take wdev instead of netdev so that it can be used on p2p-device scans] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e568872203a5..9bcaaf7cd15a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2321,6 +2321,8 @@ struct cfg80211_qos_map { * the driver, and will be valid until passed to cfg80211_scan_done(). * For scan results, call cfg80211_inform_bss(); you can call this outside * the scan/scan_done bracket too. + * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall + * indicate the status of the scan through cfg80211_scan_done(). * * @auth: Request to authenticate with the specified peer * (invoked with the wireless_dev mutex held) @@ -2593,6 +2595,7 @@ struct cfg80211_ops { int (*scan)(struct wiphy *wiphy, struct cfg80211_scan_request *request); + void (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev); int (*auth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_auth_request *req); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07099cb14778..5b7b5ebe7ca8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -820,6 +820,10 @@ * as an event to indicate changes for devices with wiphy-specific regdom * management. * + * @NL80211_CMD_ABORT_SCAN: Stop an ongoing scan. Returns -ENOENT if a scan is + * not running. The driver indicates the status of the scan through + * cfg80211_scan_done(). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1006,6 +1010,8 @@ enum nl80211_commands { NL80211_CMD_WIPHY_REG_CHANGE, + NL80211_CMD_ABORT_SCAN, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit v1.2.3 From 2f8364a291e8adde25c93f97a76abbcaf4b1ed3f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 3 Dec 2015 21:12:31 +0100 Subject: WAN: HDLC: Call notifiers before and after changing device type An HDLC device can change type when the protocol driver is changed. Calling the notifier change allows potential users of the interface know about this planned change, and even block it. After the change has occurred, send a second notification to users can evaluate the new device type etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/hdlc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index 1acb1445e05f..e31bcd4c7859 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -101,7 +101,7 @@ netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size); /* May be used by hardware driver to gain control over HDLC device */ -void detach_hdlc_protocol(struct net_device *dev); +int detach_hdlc_protocol(struct net_device *dev); static __inline__ __be16 hdlc_type_trans(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdevice.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 67ce5bd3b56a..05f5879821b8 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline bool is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3efe017fe419..1bb21ff0fa64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3661,7 +3661,7 @@ extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)); + bool (*type_check)(const struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); @@ -3858,32 +3858,32 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } -static inline bool netif_is_macvlan(struct net_device *dev) +static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } -static inline bool netif_is_macvlan_port(struct net_device *dev) +static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(struct net_device *dev) +static inline bool netif_is_ipvlan(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_SLAVE; } -static inline bool netif_is_ipvlan_port(struct net_device *dev) +static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_MASTER; } -static inline bool netif_is_bond_master(struct net_device *dev) +static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } -static inline bool netif_is_bond_slave(struct net_device *dev) +static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } @@ -3918,22 +3918,22 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } -static inline bool netif_is_team_master(struct net_device *dev) +static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } -static inline bool netif_is_team_port(struct net_device *dev) +static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } -static inline bool netif_is_lag_master(struct net_device *dev) +static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } -static inline bool netif_is_lag_port(struct net_device *dev) +static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } -- cgit v1.2.3 From 5f61385d2ebc2bd62bc389c7da0d8d2f263be1eb Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 6 Dec 2015 18:07:41 +0200 Subject: net/mlx4_core: Keep VLAN/MAC tables mirrored in multifunc HA mode Due to HW limitations, indexes to MAC and VLAN tables are always taken from the table of the actual port. So, if a resource holds an index to a table, it may refer to different values during the lifetime of the resource, unless the tables are mirrored. Also, even when driver is not in HA mode the policy of allocating an index to these tables is such to make sure, as much as possible, that when the time comes the mirroring will be successful. This means that in multifunction mode the allocation of a free index in a port's table tries to make sure that the same index in the other's port table is also free. Signed-off-by: Moni Shoua Reviewed-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 5a06d969338e..2e8af001c5da 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -75,6 +75,11 @@ static inline int mlx4_is_bonded(struct mlx4_dev *dev) return !!(dev->flags & MLX4_FLAG_BONDED); } +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + struct mlx4_port_map { u8 port1; u8 port2; -- cgit v1.2.3 From ea3793ee29d3621faf857fa8ef5425e9ff9a756d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:34 +0000 Subject: core: enable more fine-grained datagram reception control The __skb_recv_datagram routine in core/ datagram.c provides a general skb reception factility supposed to be utilized by protocol modules providing datagram sockets. It encompasses both the actual recvmsg code and a surrounding 'sleep until data is available' loop. This is inconvenient if a protocol module has to use additional locking in order to maintain some per-socket state the generic datagram socket code is unaware of (as the af_unix code does). The patch below moves the recvmsg proper code into a new __skb_try_recv_datagram routine which doesn't sleep and renames wait_for_more_packets to __skb_wait_for_more_packets, both routines being exported interfaces. The original __skb_recv_datagram routine is reimplemented on top of these two functions such that its user-visible behaviour remains unchanged. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c394bf0771..9b9b9ead7bb3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2785,6 +2785,12 @@ static inline void skb_frag_list_init(struct sk_buff *skb) #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) + +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb); +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *off, int *err, + struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, -- cgit v1.2.3 From 4baee937b8d551c89f61542a575378e407b63415 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 7 Dec 2015 13:57:32 +0100 Subject: net: dsa: remove DSA link polling Since no more DSA driver uses the polling callback, and since the phylib handles the link detection, remove the link polling work and timer code. Signed-off-by: Neil Armstrong Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3f23dd9d6a69..26a0e86e611e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,13 +116,6 @@ struct dsa_switch_tree { s8 cpu_switch; s8 cpu_port; - /* - * Link state polling. - */ - int link_poll_needed; - struct work_struct link_poll_work; - struct timer_list link_poll_timer; - /* * Data for the individual switch chips. */ @@ -231,11 +224,6 @@ struct dsa_switch_driver { int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); - /* - * Link state polling and IRQ handling. - */ - void (*poll_link)(struct dsa_switch *ds); - /* * Link state adjustment (called from libphy) */ -- cgit v1.2.3 From 8ac2837c89c8c0fcad557e4380aeef80580390f9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Dec 2015 10:51:12 +0800 Subject: Revert "Merge branch 'vsock-virtio'" This reverts commit 0d76d6e8b2507983a2cae4c09880798079007421 and merge commit c402293bd76fbc93e52ef8c0947ab81eea3ae019, reversing changes made to c89359a42e2a49656451569c382eed63e781153c. The virtio-vsock device specification is not finalized yet. Michael Tsirkin voiced concerned about merging this code when the hardware interface (and possibly the userspace interface) could still change. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 -------------------------------------- include/net/af_vsock.h | 2 - include/uapi/linux/virtio_ids.h | 1 - include/uapi/linux/virtio_vsock.h | 89 ---------------- 4 files changed, 301 deletions(-) delete mode 100644 include/linux/virtio_vsock.h delete mode 100644 include/uapi/linux/virtio_vsock.h (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h deleted file mode 100644 index a5f3ecc038f7..000000000000 --- a/include/linux/virtio_vsock.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _LINUX_VIRTIO_VSOCK_H -#define _LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) -#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) -#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) -#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) - -struct vsock_transport_recv_notify_data; -struct vsock_transport_send_notify_data; -struct sockaddr_vm; -struct vsock_sock; - -enum { - VSOCK_VQ_CTRL = 0, - VSOCK_VQ_RX = 1, /* for host to guest data */ - VSOCK_VQ_TX = 2, /* for guest to host data */ - VSOCK_VQ_MAX = 3, -}; - -/* virtio transport socket state */ -struct virtio_transport { - struct virtio_transport_pkt_ops *ops; - struct vsock_sock *vsk; - - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - - struct mutex tx_lock; - struct mutex rx_lock; - - struct list_head rx_queue; - u32 rx_bytes; - - /* Protected by trans->tx_lock */ - u32 tx_cnt; - u32 buf_alloc; - u32 peer_fwd_cnt; - u32 peer_buf_alloc; - /* Protected by trans->rx_lock */ - u32 fwd_cnt; - - /* Protected by sk_lock */ - u16 dgram_id; - struct list_head incomplete_dgrams; /* dgram fragments */ -}; - -struct virtio_vsock_pkt { - struct virtio_vsock_hdr hdr; - struct virtio_transport *trans; - struct work_struct work; - struct list_head list; - void *buf; - u32 len; - u32 off; -}; - -struct virtio_vsock_pkt_info { - u32 remote_cid, remote_port; - struct msghdr *msg; - u32 pkt_len; - u16 type; - u16 op; - u32 flags; - u16 dgram_id; - u16 dgram_len; -}; - -struct virtio_transport_pkt_ops { - int (*send_pkt)(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info); -}; - -void virtio_vsock_dumppkt(const char *func, - const struct virtio_vsock_pkt *pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt); -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port); -ssize_t -virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, - int type); -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now); -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_available_now); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); -bool virtio_transport_stream_is_active(struct vsock_sock *vsk); -bool virtio_transport_stream_allow(u32 cid, u32 port); -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr); -bool virtio_transport_dgram_allow(u32 cid, u32 port); - -int virtio_transport_connect(struct vsock_sock *vsk); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); - -void virtio_transport_release(struct vsock_sock *vsk); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len); -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t len); - -void virtio_transport_destruct(struct vsock_sock *vsk); - -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); -#endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index a0c8fa2ababf..e9eb2d6791b3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -175,10 +175,8 @@ void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); -struct sock *vsock_find_unbound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); -int vsock_bind_dgram_generic(struct vsock_sock *vsk, struct sockaddr_vm *addr); #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 16dcf5d06cd7..77925f587b15 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,7 +39,6 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_VSOCK 13 /* virtio vsock transport */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ #define VIRTIO_ID_INPUT 18 /* virtio input */ diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h deleted file mode 100644 index 8cf9b5682628..000000000000 --- a/include/uapi/linux/virtio_vsock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _UAPI_LINUX_VIRTIO_VSOCK_H -#define _UAPI_LINUX_VIRTIO_VOSCK_H - -#include -#include -#include - -struct virtio_vsock_config { - __le32 guest_cid; - __le32 max_virtqueue_pairs; -}; - -struct virtio_vsock_hdr { - __le32 src_cid; - __le32 src_port; - __le32 dst_cid; - __le32 dst_port; - __le32 len; - __le16 type; /* enum virtio_vsock_type */ - __le16 op; /* enum virtio_vsock_op */ - __le32 flags; - __le32 buf_alloc; - __le32 fwd_cnt; -}; - -enum virtio_vsock_type { - VIRTIO_VSOCK_TYPE_STREAM = 1, - VIRTIO_VSOCK_TYPE_DGRAM = 2, -}; - -enum virtio_vsock_op { - VIRTIO_VSOCK_OP_INVALID = 0, - - /* Connect operations */ - VIRTIO_VSOCK_OP_REQUEST = 1, - VIRTIO_VSOCK_OP_RESPONSE = 2, - VIRTIO_VSOCK_OP_ACK = 3, - VIRTIO_VSOCK_OP_RST = 4, - VIRTIO_VSOCK_OP_SHUTDOWN = 5, - - /* To send payload */ - VIRTIO_VSOCK_OP_RW = 6, - - /* Tell the peer our credit info */ - VIRTIO_VSOCK_OP_CREDIT_UPDATE = 7, - /* Request the peer to send the credit info to us */ - VIRTIO_VSOCK_OP_CREDIT_REQUEST = 8, -}; - -/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */ -enum virtio_vsock_shutdown { - VIRTIO_VSOCK_SHUTDOWN_RCV = 1, - VIRTIO_VSOCK_SHUTDOWN_SEND = 2, -}; - -#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 297dbde19cf6a0ccb6fd4396c6220a5912ed61e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:51 -0500 Subject: netprio_cgroup: limit the maximum css->id to USHRT_MAX netprio builds per-netdev contiguous priomap array which is indexed by css->id. The array is allocated using kzalloc() effectively limiting the maximum ID supported to some thousand range. This patch caps the maximum supported css->id to USHRT_MAX which should be way above what is actually useable. This allows reducing sock->sk_cgrp_prioidx to u16 from u32. The freed up part will be used to overload the cgroup related fields. sock->sk_cgrp_prioidx's position is swapped with sk_mark so that the two cgroup related fields are adjacent. Signed-off-by: Tejun Heo Acked-by: Daniel Wagner Cc: Daniel Borkmann CC: Neil Horman Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 6f58b84fc742..a95bcf7d6efa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -288,7 +288,6 @@ struct cg_proto; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_priority: %SO_PRIORITY setting - * @sk_cgrp_prioidx: socket group's priority map index * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_pid: &struct pid for this socket's peer @@ -309,6 +308,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark + * @sk_cgrp_prioidx: socket group's priority map index * @sk_classid: this socket's cgroup classid * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start @@ -425,9 +425,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; __u32 sk_priority; -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - __u32 sk_cgrp_prioidx; -#endif + __u32 sk_mark; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; @@ -445,7 +443,9 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif - __u32 sk_mark; +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) + u16 sk_cgrp_prioidx; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID u32 sk_classid; #endif -- cgit v1.2.3 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgroup_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the followings are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. * The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 36 ++++++++++++++++++++++++++++++++++++ include/net/cls_cgroup.h | 11 +++++------ include/net/netprio_cgroup.h | 16 +++++++++++++--- include/net/sock.h | 11 +++-------- 4 files changed, 57 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 504d8591b6d3..ed128fed0335 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +struct sock_cgroup_data { + u16 prioidx; + u32 classid; +}; + +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + return skcd->prioidx; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + return skcd->classid; +} + +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + skcd->prioidx = prioidx; +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + skcd->classid = classid; +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index ccd6d8bffa4d..c0a92e2c286d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -41,13 +41,12 @@ static inline u32 task_cls_classid(struct task_struct *p) return classid; } -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { u32 classid; classid = task_cls_classid(current); - if (classid != sk->sk_classid) - sk->sk_classid = classid; + sock_cgroup_set_classid(skcd, classid); } static inline u32 task_get_classid(const struct sk_buff *skb) @@ -64,17 +63,17 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * softirqs always disables bh. */ if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ + /* If there is an sock_cgroup_classid we'll use that. */ if (!skb->sk) return 0; - classid = skb->sk->sk_classid; + classid = sock_cgroup_classid(&skb->sk->sk_cgrp_data); } return classid; } #else /* !CONFIG_CGROUP_NET_CLASSID */ -static inline void sock_update_classid(struct sock *sk) +static inline void sock_update_classid(struct sock_cgroup_data *skcd) { } diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index f2a9597ff53c..604190596cde 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -25,8 +25,6 @@ struct netprio_map { u32 priomap[]; }; -void sock_update_netprioidx(struct sock *sk); - static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -38,13 +36,25 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } + +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ + if (in_interrupt()) + return; + + sock_cgroup_set_prioidx(skcd, task_netprioidx(current)); +} + #else /* !CONFIG_CGROUP_NET_PRIO */ + static inline u32 task_netprioidx(struct task_struct *p) { return 0; } -#define sock_update_netprioidx(sk) +static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) +{ +} #endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index a95bcf7d6efa..0ca22b014de1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -308,8 +309,7 @@ struct cg_proto; * @sk_send_head: front of stuff to transmit * @sk_security: used by security modules * @sk_mark: generic packet mark - * @sk_cgrp_prioidx: socket group's priority map index - * @sk_classid: this socket's cgroup classid + * @sk_cgrp_data: cgroup data for this cgroup * @sk_cgrp: this socket's cgroup-specific proto data * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock @@ -443,12 +443,7 @@ struct sock { #ifdef CONFIG_SECURITY void *sk_security; #endif -#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) - u16 sk_cgrp_prioidx; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - u32 sk_classid; -#endif + struct sock_cgroup_data sk_cgrp_data; struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); -- cgit v1.2.3 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in anyway and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 88 +++++++++++++++++++++++++++++++++++++++++---- include/linux/cgroup.h | 41 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ed128fed0335..9dc226345e4e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #ifdef CONFIG_SOCK_CGROUP_DATA +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ struct sock_cgroup_data { - u16 prioidx; - u32 classid; + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; }; +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) { - return skcd->prioidx; + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; } static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) { - return skcd->classid; + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; } +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. + */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - skcd->prioidx = prioidx; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - skcd->classid = classid; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4c3ffab81ba7..a8ba1ea0ea5a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,4 +578,45 @@ static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) +extern spinlock_t cgroup_sk_update_lock; +#endif + +void cgroup_sk_alloc_disable(void); +void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_free(struct sock_cgroup_data *skcd); + +static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) +{ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + unsigned long v; + + /* + * @skcd->val is 64bit but the following is safe on 32bit too as we + * just need the lower ulong to be written and read atomically. + */ + v = READ_ONCE(skcd->val); + + if (v & 1) + return &cgrp_dfl_root.cgrp; + + return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; +#else + return (struct cgroup *)(unsigned long)skcd->val; +#endif +} + +#else /* CONFIG_CGROUP_DATA */ + +static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} + +#endif /* CONFIG_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_H */ -- cgit v1.2.3 From 33d5a7b14bfd02e60af9d223db8dfff0cbcabe6b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Nov 2015 21:53:04 +0100 Subject: netfilter: nf_tables: extend tracing infrastructure nft monitor mode can then decode and display this trace data. Parts of LL/Network/Transport headers are provided as separate attributes. Otherwise, printing IP address data becomes virtually impossible for userspace since in the case of the netdev family we really don't want userspace to have to know all the possible link layer types and/or sizes just to display/print an ip address. We also don't want userspace to have to follow ipv6 header chains to get the s/dport info, the kernel already did this work for us. To avoid bloating nft_do_chain all data required for tracing is encapsulated in nft_traceinfo. The structure is initialized unconditionally(!) for each nft_do_chain invocation. This unconditionall call will be moved under a static key in a followup patch. With lots of help from Patrick McHardy and Pablo Neira. Signed-off-by: Florian Westphal Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 32 ++++++++++++++++++++ include/uapi/linux/netfilter/nf_tables.h | 52 ++++++++++++++++++++++++++++++++ include/uapi/linux/netfilter/nfnetlink.h | 2 ++ 3 files changed, 86 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 101d7d7ec243..b313cda49194 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -888,6 +888,38 @@ void nft_unregister_chain_type(const struct nf_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); +int nft_verdict_dump(struct sk_buff *skb, int type, + const struct nft_verdict *v); + +/** + * struct nft_traceinfo - nft tracing information and state + * + * @pkt: pktinfo currently processed + * @basechain: base chain currently processed + * @chain: chain currently processed + * @rule: rule that was evaluated + * @verdict: verdict given by rule + * @type: event type (enum nft_trace_types) + * @packet_dumped: packet headers sent in a previous traceinfo message + * @trace: other struct members are initialised + */ +struct nft_traceinfo { + const struct nft_pktinfo *pkt; + const struct nft_base_chain *basechain; + const struct nft_chain *chain; + const struct nft_rule *rule; + const struct nft_verdict *verdict; + enum nft_trace_types type; + bool packet_dumped; + bool trace; +}; + +void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, + const struct nft_verdict *verdict, + const struct nft_chain *basechain); + +void nft_trace_notify(struct nft_traceinfo *info); + #define nft_dereference(p) \ nfnl_dereference(p, NFNL_SUBSYS_NFTABLES) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5f3ececf84b3..b48a3ab761f8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -83,6 +83,7 @@ enum nft_verdicts { * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) + * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -102,6 +103,7 @@ enum nf_tables_msg_types { NFT_MSG_DELSETELEM, NFT_MSG_NEWGEN, NFT_MSG_GETGEN, + NFT_MSG_TRACE, NFT_MSG_MAX, }; @@ -987,4 +989,54 @@ enum nft_gen_attributes { }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) +/** + * enum nft_trace_attributes - nf_tables trace netlink attributes + * + * @NFTA_TRACE_TABLE: name of the table (NLA_STRING) + * @NFTA_TRACE_CHAIN: name of the chain (NLA_STRING) + * @NFTA_TRACE_RULE_HANDLE: numeric handle of the rule (NLA_U64) + * @NFTA_TRACE_TYPE: type of the event (NLA_U32: nft_trace_types) + * @NFTA_TRACE_VERDICT: verdict returned by hook (NLA_NESTED: nft_verdicts) + * @NFTA_TRACE_ID: pseudo-id, same for each skb traced (NLA_U32) + * @NFTA_TRACE_LL_HEADER: linklayer header (NLA_BINARY) + * @NFTA_TRACE_NETWORK_HEADER: network header (NLA_BINARY) + * @NFTA_TRACE_TRANSPORT_HEADER: transport header (NLA_BINARY) + * @NFTA_TRACE_IIF: indev ifindex (NLA_U32) + * @NFTA_TRACE_IIFTYPE: netdev->type of indev (NLA_U16) + * @NFTA_TRACE_OIF: outdev ifindex (NLA_U32) + * @NFTA_TRACE_OIFTYPE: netdev->type of outdev (NLA_U16) + * @NFTA_TRACE_MARK: nfmark (NLA_U32) + * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) + * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + */ +enum nft_trace_attibutes { + NFTA_TRACE_UNSPEC, + NFTA_TRACE_TABLE, + NFTA_TRACE_CHAIN, + NFTA_TRACE_RULE_HANDLE, + NFTA_TRACE_TYPE, + NFTA_TRACE_VERDICT, + NFTA_TRACE_ID, + NFTA_TRACE_LL_HEADER, + NFTA_TRACE_NETWORK_HEADER, + NFTA_TRACE_TRANSPORT_HEADER, + NFTA_TRACE_IIF, + NFTA_TRACE_IIFTYPE, + NFTA_TRACE_OIF, + NFTA_TRACE_OIFTYPE, + NFTA_TRACE_MARK, + NFTA_TRACE_NFPROTO, + NFTA_TRACE_POLICY, + __NFTA_TRACE_MAX +}; +#define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) + +enum nft_trace_types { + NFT_TRACETYPE_UNSPEC, + NFT_TRACETYPE_POLICY, + NFT_TRACETYPE_RETURN, + NFT_TRACETYPE_RULE, + __NFT_TRACETYPE_MAX +}; +#define NFT_TRACETYPE_MAX (__NFT_TRACETYPE_MAX - 1) #endif /* _LINUX_NF_TABLES_H */ diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 354a7e5e50f2..4bb8cb7730e7 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -22,6 +22,8 @@ enum nfnetlink_groups { #define NFNLGRP_NFTABLES NFNLGRP_NFTABLES NFNLGRP_ACCT_QUOTA, #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA + NFNLGRP_NFTRACE, +#define NFNLGRP_NFTRACE NFNLGRP_NFTRACE __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) -- cgit v1.2.3 From e639f7ab079b5256660018511d87aa34b54f1a9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 28 Nov 2015 21:53:05 +0100 Subject: netfilter: nf_tables: wrap tracing with a static key Only needed when meta nftrace rule(s) were added. The assumption is that no such rules are active, so the call to nft_trace_init is "never" needed. When nftrace rules are active, we always call the nft_trace_* functions, but will only send netlink messages when all of the following are true: - traceinfo structure was initialised - skb->nf_trace == 1 - at least one subscriber to trace group. Adding an extra conditional (static_branch ... && skb->nf_trace) nft_trace_init( ..) Is possible but results in a larger nft_do_chain footprint. Signed-off-by: Florian Westphal Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + include/net/netfilter/nft_meta.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 4ff5424909aa..a9060dd99db7 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -57,6 +57,7 @@ struct nft_payload_set { }; extern const struct nft_expr_ops nft_payload_fast_ops; +extern struct static_key_false nft_trace_enabled; int nft_payload_module_init(void); void nft_payload_module_exit(void); diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 711887a09e91..d27588c8dbd9 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -33,4 +33,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr); + #endif -- cgit v1.2.3 From ad2c8c73d29702c3193f739390f6661f9a4ecad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Dec 2015 12:30:46 -0500 Subject: cgroup: fix sock_cgroup_data initialization on earlier compilers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sock_cgroup_data is a struct containing an anonymous union. sock_cgroup_set_prioidx() and sock_cgroup_set_classid() were initializing a field inside the anonymous union as follows. struct sock_ccgroup_data skcd_buf = { .val = VAL }; While this is fine on more recent compilers, gcc-4.4.7 triggers the following errors. include/linux/cgroup-defs.h: In function ‘sock_cgroup_set_prioidx’: include/linux/cgroup-defs.h:619: error: unknown field ‘val’ specified in initializer include/linux/cgroup-defs.h:619: warning: missing braces around initializer include/linux/cgroup-defs.h:619: warning: (near initialization for ‘skcd_buf.’) This is because .val belongs to the anonymous union nested inside the struct but the initializer is missing the nesting. Fix it by adding an extra pair of braces. Signed-off-by: Tejun Heo Reported-by: Alaa Hleihel Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9dc226345e4e..097901a68671 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -616,7 +616,7 @@ static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_prioidx(&skcd_buf) == prioidx) return; @@ -633,7 +633,7 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_classid(&skcd_buf) == classid) return; -- cgit v1.2.3 From 01b1cb87d37fb19cdaa5e7002416fdde156873d0 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 16 Nov 2015 12:52:21 +0200 Subject: Bluetooth: Run page scan updates through hdev->req_workqueue Since Add/Remove Device perform the page scan updates independently from the HCI command completion we've introduced a potential race when multiple mgmt commands are queued. Doing the page scan updates through the req_workqueue ensures that the state changes are performed in a race-free manner. At the same time, to make the request helper more widely usable, extend it to also cover Inquiry Scan changes since those are behind the same HCI command. This is also reflected in the new name of the API as well as the work struct name. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 55ce209157b1..eda809a5c3df 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -329,6 +329,7 @@ struct hci_dev { struct work_struct discov_update; struct work_struct bg_scan_update; + struct work_struct scan_update; struct delayed_work le_scan_disable; struct delayed_work le_scan_restart; -- cgit v1.2.3 From f22525700b2ae34eb97a29a91e2eee902062b484 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 18 Nov 2015 12:49:20 +0200 Subject: Bluetooth: Move advertising instance management to hci_request.c This paves the way for eventually performing advertising changes through the hdev->req_workqueue. Some new APIs need to be exposed from mgmt.c to hci_request.c and vice-versa, but many of them will go away once hdev->req_workqueue gets used. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index eda809a5c3df..b56085b6ecce 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1434,9 +1434,7 @@ void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); void mgmt_set_powered_failed(struct hci_dev *hdev, int err); int mgmt_powered(struct hci_dev *hdev, u8 powered); -int mgmt_update_adv_data(struct hci_dev *hdev); void mgmt_discoverable_timeout(struct hci_dev *hdev); -void mgmt_adv_timeout_expired(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, @@ -1491,8 +1489,13 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 store_hint, u16 min_interval, u16 max_interval, u16 latency, u16 timeout); -void mgmt_reenable_advertising(struct hci_dev *hdev); void mgmt_smp_complete(struct hci_conn *conn, bool complete); +bool mgmt_get_connectable(struct hci_dev *hdev); +u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev); +void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, + u8 instance); +void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, + u8 instance); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); -- cgit v1.2.3 From 53c0ba74510c1182786dcd1e3710215467777601 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 22 Nov 2015 16:43:43 +0300 Subject: Bluetooth: Move connectable changes to hdev->req_workqueue This way the connectable changes are synchronized against each other, which helps avoid potential races. The connectable mode is also linked together with LE advertising which makes is more convenient to have it behind the same workqueue. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b56085b6ecce..a855e41df68c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -330,6 +330,7 @@ struct hci_dev { struct work_struct discov_update; struct work_struct bg_scan_update; struct work_struct scan_update; + struct work_struct connectable_update; struct delayed_work le_scan_disable; struct delayed_work le_scan_restart; @@ -1491,6 +1492,7 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, u16 max_interval, u16 latency, u16 timeout); void mgmt_smp_complete(struct hci_conn *conn, bool complete); bool mgmt_get_connectable(struct hci_dev *hdev); +void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status); u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev); void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); -- cgit v1.2.3 From aed1a8851db022c3bd22af41a343068b8c6e40c1 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Sun, 22 Nov 2015 17:24:44 +0300 Subject: Bluetooth: Move discoverable changes to hdev->req_workqueue The discoverable mode is intrinsically linked with the connectable mode e.g. through sharing the same HCI command (Write Scan Enable) for BR/EDR. It makes therefore sense to move it to hci_request.c and run the changes through the same hdev->req_workqueue. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a855e41df68c..0a6966ed7ee1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -331,6 +331,7 @@ struct hci_dev { struct work_struct bg_scan_update; struct work_struct scan_update; struct work_struct connectable_update; + struct work_struct discoverable_update; struct delayed_work le_scan_disable; struct delayed_work le_scan_restart; @@ -1493,6 +1494,7 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, void mgmt_smp_complete(struct hci_conn *conn, bool complete); bool mgmt_get_connectable(struct hci_dev *hdev); void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status); +void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status); u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev); void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); -- cgit v1.2.3 From c366f555b8df67633b849a5088bb897d6c63aaa5 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Mon, 23 Nov 2015 15:43:06 +0200 Subject: Bluetooth: Move discoverable timeout behind hdev->req_workqueue Since the other discoverable changes are behind req_workqueue now it only makes sense to move the discoverable timeout there as well. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0a6966ed7ee1..319bf020cea6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1436,7 +1436,6 @@ void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); void mgmt_set_powered_failed(struct hci_dev *hdev, int err); int mgmt_powered(struct hci_dev *hdev, u8 powered); -void mgmt_discoverable_timeout(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, -- cgit v1.2.3 From 2ff13894cfb877cb3d02d96a8402202f0a6f3efd Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 25 Nov 2015 16:15:44 +0200 Subject: Bluetooth: Perform HCI update for power on synchronously The request to update HCI during power on is always coming either from hdev->req_workqueue or through an ioctl, so it's safe to use hci_req_sync for it. This way we also eliminate potential races with incoming mgmt commands or other actions while powering on. Part of this refactoring is the splitting of mgmt_powered() into mgmt_power_on() and __mgmt_power_off() functions. The main reason is the different requirements as far as hdev locking is concerned, as highlighted with the __ prefix of the power off API. Since the power on in the case of clearing the AUTO_OFF flag cannot be done synchronously in the set_powered mgmt handler, the hci_power_on work callback is extended to cover this (which also simplifies the set_powered helper a lot). Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 319bf020cea6..c95e0326c41a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1435,7 +1435,8 @@ int mgmt_new_settings(struct hci_dev *hdev); void mgmt_index_added(struct hci_dev *hdev); void mgmt_index_removed(struct hci_dev *hdev); void mgmt_set_powered_failed(struct hci_dev *hdev, int err); -int mgmt_powered(struct hci_dev *hdev, u8 powered); +void mgmt_power_on(struct hci_dev *hdev, int err); +void __mgmt_power_off(struct hci_dev *hdev); void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, bool persistent); void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, -- cgit v1.2.3 From 17fd08ffb5981cff2c921eb479f46b872b02b2b9 Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 26 Nov 2015 12:15:59 +0200 Subject: Bluetooth: Remove unnecessary HCI_ADVERTISING_INSTANCE flag This flag just tells us whether hdev->adv_instances is empty or not. We can equally well use the list_empty() function to get this information. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index cc2216727655..339ea57be423 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -239,7 +239,6 @@ enum { HCI_LE_ENABLED, HCI_ADVERTISING, HCI_ADVERTISING_CONNECTABLE, - HCI_ADVERTISING_INSTANCE, HCI_CONNECTABLE, HCI_DISCOVERABLE, HCI_LIMITED_DISCOVERABLE, -- cgit v1.2.3 From 00f59314111a6b18ee65b238b38c470dbdbf3be5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 9 Dec 2015 22:46:29 +0100 Subject: 6lowpan: add lowpan dev register helpers This patch introduces register and unregister functionality for lowpan interfaces. While register a lowpan interface there are several things which need to be initialize by the 6lowpan subsystem. Upcoming functionality need to register/unregister per interface components e.g. debugfs entry. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index cf3bc564ac03..730211fd8ed7 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -185,7 +185,12 @@ static inline void lowpan_push_hc_data(u8 **hc_ptr, const void *data, *hc_ptr += len; } -void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype); +int lowpan_register_netdevice(struct net_device *dev, + enum lowpan_lltypes lltype); +int lowpan_register_netdev(struct net_device *dev, + enum lowpan_lltypes lltype); +void lowpan_unregister_netdevice(struct net_device *dev); +void lowpan_unregister_netdev(struct net_device *dev); /** * lowpan_header_decompress - replace 6LoWPAN header with IPv6 header -- cgit v1.2.3 From b1815fd949e5bd06d118019acf68f87c9414f705 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 9 Dec 2015 22:46:30 +0100 Subject: 6lowpan: add debugfs support This patch will introduce a 6lowpan entry into the debugfs if enabled. Inside this 6lowpan directory we create a subdirectories of all 6lowpan interfaces to offer a per interface debugfs support. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index 730211fd8ed7..2f6a3f2233ed 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -53,6 +53,8 @@ #ifndef __6LOWPAN_H__ #define __6LOWPAN_H__ +#include + #include #include @@ -98,6 +100,7 @@ enum lowpan_lltypes { struct lowpan_priv { enum lowpan_lltypes lltype; + struct dentry *iface_debugfs; /* must be last */ u8 priv[0] __aligned(sizeof(void *)); -- cgit v1.2.3 From 818f1f3e70dd4c8e6f8d59c617857be0fa0fce7c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 9 Dec 2015 22:46:31 +0100 Subject: ipv6: add ipv6_addr_prefix_copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a static inline function ipv6_addr_prefix_copy which copies a ipv6 address prefix(argument pfx) into the ipv6 address prefix. The prefix len is given by plen as bits. This function mainly based on ipv6_addr_prefix which copies one address prefix from address into a new ipv6 address destination and zero all other address bits. The difference is that ipv6_addr_prefix_copy don't get a prefix from an ipv6 address, it sets a prefix to an ipv6 address with keeping other address bits. The use case is for context based address compression inside 6LoWPAN IPHC header which keeping ipv6 prefixes inside a context table to lookup address-bits without sending them. Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Acked-by: Łukasz Duda Acked-by: Hannes Frederic Sowa Acked-by: YOSHIFUJI Hideaki Acked-by: David S. Miller Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/ipv6.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9a5c9f013784..6570f379aba2 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -401,6 +401,21 @@ static inline void ipv6_addr_prefix(struct in6_addr *pfx, pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } +static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, + const struct in6_addr *pfx, + int plen) +{ + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; + + memcpy(addr->s6_addr, pfx, o); + if (b != 0) { + addr->s6_addr[o] &= ~(0xff00 >> b); + addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); + } +} + static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { -- cgit v1.2.3 From 899077791403ff7a2d8cfaa87bd1a82d729463e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 8 Dec 2015 16:32:27 +0100 Subject: netcp: try to reduce type confusion in descriptors The netcp driver produces tons of warnings when CONFIG_LPAE is enabled on ARM: drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_tx_map_skb': drivers/net/ethernet/ti/netcp_core.c:1084:13: warning: passing argument 1 of 'set_words' from incompatible pointer type [-Wincompatible-pointer-types] This is the result of trying to pass a pointer to a dma_addr_t to a function that expects a u32 pointer to copy that into a DMA descriptor. Looking at that code in more detail to fix the warnings, I see multiple related problems: * The conversion functions are not endian-safe, as the DMA descriptors are almost certainly fixed-endian, but the CPU is not. * On 64-bit machines, passing a pointer through a u32 variable is a bug, accessing an indirect pointer as a u32 pointer even more so. * The handling of epib and psdata mixes native-endian and device-endian data. In this patch, I try to sort out the types for most accesses here, adding le32_to_cpu/cpu_to_le32 where appropriate, and passing pointers through two 32-bit words in the descriptor padding, to make it plausible that the driver does the right thing if compiled for big-endian or 64-bit systems. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/soc/ti/knav_dma.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index dad035c16d94..343c13ac4f71 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -144,17 +144,17 @@ struct knav_dma_cfg { * @psdata: Protocol specific */ struct knav_dma_desc { - u32 desc_info; - u32 tag_info; - u32 packet_info; - u32 buff_len; - u32 buff; - u32 next_desc; - u32 orig_len; - u32 orig_buff; - u32 epib[KNAV_DMA_NUM_EPIB_WORDS]; - u32 psdata[KNAV_DMA_NUM_PS_WORDS]; - u32 pad[4]; + __le32 desc_info; + __le32 tag_info; + __le32 packet_info; + __le32 buff_len; + __le32 buff; + __le32 next_desc; + __le32 orig_len; + __le32 orig_buff; + __le32 epib[KNAV_DMA_NUM_EPIB_WORDS]; + __le32 psdata[KNAV_DMA_NUM_PS_WORDS]; + __le32 pad[4]; } ____cacheline_aligned; #if IS_ENABLED(CONFIG_KEYSTONE_NAVIGATOR_DMA) -- cgit v1.2.3 From 26a8145390b36cbe97a5bd0b9e97249f21af6aea Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:39 +0200 Subject: net/mlx5_core: Introduce flow steering firmware commands Introduce new Flow Steering (FS) firmware commands, in-order to support the new flow steering infrastructure. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 47 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 32 ++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 include/linux/mlx5/fs.h (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h new file mode 100644 index 000000000000..34fd8dc0b3e1 --- /dev/null +++ b/include/linux/mlx5/fs.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_ +#define _MLX5_FS_ + +#include + +struct mlx5_flow_table; + +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + struct mlx5_flow_table *ft; + }; +}; +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f5d94495758a..131a2737cfa3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -256,25 +256,27 @@ struct mlx5_ifc_flow_table_fields_supported_bits { struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; - u8 reserved_0[0x1f]; + u8 reserved_0[0x2]; + u8 flow_modify_en[0x1]; + u8 reserved_1[0x1c]; - u8 reserved_1[0x2]; + u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_2[0x10]; + u8 reserved_3[0x10]; u8 max_ft_level[0x8]; - u8 reserved_3[0x20]; + u8 reserved_4[0x20]; - u8 reserved_4[0x18]; + u8 reserved_5[0x18]; u8 log_max_ft_num[0x8]; - u8 reserved_5[0x18]; + u8 reserved_6[0x18]; u8 log_max_destination[0x8]; - u8 reserved_6[0x18]; + u8 reserved_7[0x18]; u8 log_max_flow[0x8]; - u8 reserved_7[0x40]; + u8 reserved_8[0x40]; struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; @@ -2843,6 +2845,13 @@ struct mlx5_ifc_set_hca_cap_in_bits { union mlx5_ifc_hca_cap_union_bits capability; }; +enum { + MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 +}; + struct mlx5_ifc_set_fte_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; @@ -2867,11 +2876,14 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_4[0x8]; u8 table_id[0x18]; - u8 reserved_5[0x40]; + u8 reserved_5[0x18]; + u8 modify_enable_mask[0x8]; + + u8 reserved_6[0x20]; u8 flow_index[0x20]; - u8 reserved_6[0xe0]; + u8 reserved_7[0xe0]; struct mlx5_ifc_flow_context_bits flow_context; }; -- cgit v1.2.3 From 2530236303d9e705db6a28eb9a10c8d79b288b37 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:43 +0200 Subject: net/mlx5_core: Flow steering tree initialization Flow steering initialization is based on static tree which illustrates the flow steering tree when the driver is loaded. The initialization considers the max supported flow table level of the device, a minimum of 2 kernel flow tables(vlan and mac) are required to have kernel flow table functionality. The tree structures when the driver is loaded: root_namespace(receive nic) | priority-0 (kernel priority) | namespace(kernel namespace) | priority-0 (flow tables priority) In the following patches, When the EN driver will use the flow steering API, it create two flow tables and their flow groups under priority-0(flow tables priority). Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/fs.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ac098b6b97bf..2fd7019f69db 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -502,6 +502,8 @@ struct mlx5_priv { struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; }; enum mlx5_device_state { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 34fd8dc0b3e1..16ae5233dc7b 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -35,6 +35,13 @@ #include +#define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + +enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_FDB, +}; + struct mlx5_flow_table; struct mlx5_flow_destination { @@ -42,6 +49,7 @@ struct mlx5_flow_destination { union { u32 tir_num; struct mlx5_flow_table *ft; + u32 vport_num; }; }; #endif -- cgit v1.2.3 From 86d722ad2c3bd2f0536b196b7fd67ae2a7e2a492 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:44 +0200 Subject: net/mlx5: Use flow steering infrastructure for mlx5_en Expose the new flow steering API and remove the old one. Few changes are required: 1. The Ethernet flow steering follows the existing implementation, but uses the new steering API. The old flow steering implementation is removed. 2. Move the E-switch FDB management to use the new API. 3. When driver is loaded call to mlx5_init_fs which initialize the flow steering tree structure, open namespaces for NIC receive and for E-switch FDB. 4. Call to mlx5_cleanup_fs when the driver is unloaded. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/flow_table.h | 63 ----------------------------------------- include/linux/mlx5/fs.h | 38 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 63 deletions(-) delete mode 100644 include/linux/mlx5/flow_table.h (limited to 'include') diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h deleted file mode 100644 index 0f2a15cf3317..000000000000 --- a/include/linux/mlx5/flow_table.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MLX5_FLOW_TABLE_H -#define MLX5_FLOW_TABLE_H - -#include - -struct mlx5_flow_table_group { - u8 log_sz; - u8 match_criteria_enable; - u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; -}; - -struct mlx5_flow_destination { - enum mlx5_flow_destination_type type; - union { - u32 tir_num; - void *ft; - u32 vport_num; - }; -}; - -void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, - u16 num_groups, - struct mlx5_flow_table_group *group); -void mlx5_destroy_flow_table(void *flow_table); -int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, - void *match_criteria, void *flow_context, - u32 *flow_index); -void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index); -u32 mlx5_get_flow_table_id(void *flow_table); - -#endif /* MLX5_FLOW_TABLE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 16ae5233dc7b..bc7ad019afde 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -33,6 +33,7 @@ #ifndef _MLX5_FS_ #define _MLX5_FS_ +#include #include #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 @@ -43,6 +44,9 @@ enum mlx5_flow_namespace_type { }; struct mlx5_flow_table; +struct mlx5_flow_group; +struct mlx5_flow_rule; +struct mlx5_flow_namespace; struct mlx5_flow_destination { enum mlx5_flow_destination_type type; @@ -52,4 +56,38 @@ struct mlx5_flow_destination { u32 vport_num; }; }; + +struct mlx5_flow_namespace * +mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type); + +struct mlx5_flow_table * +mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries); +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); + +/* inbox should be set with the following values: + * start_flow_index + * end_flow_index + * match_criteria_enable + * match_criteria + */ +struct mlx5_flow_group * +mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); + +/* Single destination per rule. + * Group ID is implied by the match criteria. + */ +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest); +void mlx5_del_flow_rule(struct mlx5_flow_rule *fr); + #endif -- cgit v1.2.3 From 369620a09bc5ab867342d51f1820c66b00d78a2c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 10 Dec 2015 12:37:44 -0800 Subject: rco: Clean up casting errors Fixe a couple of cast errors found by sparse. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 9fcaedf994ee..10a16b5bd1c7 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -165,7 +165,8 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, csum = csum_sub(csum, csum_partial(ptr, start, 0)); /* Set derived checksum in packet */ - delta = csum_sub(csum_fold(csum), *psum); + delta = csum_sub((__force __wsum)csum_fold(csum), + (__force __wsum)*psum); *psum = csum_fold(csum); return delta; -- cgit v1.2.3 From abe492b4f50c3ae2ebcfaa2f5c16176aebaa1c68 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 10 Dec 2015 12:37:45 -0800 Subject: geneve: UDP checksum configuration via netlink Add support to enable and disable UDP checksums via netlink. This is similar to how VXLAN and GUE allow this. This includes support for enabling the UDP zero checksum (for both TX and RX). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5ad57375a99f..2be1dd5a103f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -462,6 +462,9 @@ enum { IFLA_GENEVE_PORT, /* destination port */ IFLA_GENEVE_COLLECT_METADATA, IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 19576c9478682a398276c994ea0d2696474df32b Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 9 Dec 2015 14:07:40 +0100 Subject: netfilter: cttimeout: add netns support Add a per-netns list of timeout objects and adjust code to use it. Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 3 +++ include/net/netfilter/nf_conntrack_timeout.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2dcea635ecce..4089abc6e9c0 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -121,6 +121,9 @@ struct net { #if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) struct list_head nfnl_acct_list; #endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + struct list_head nfct_timeout_list; +#endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index f72be38860a7..5cc5e9e6171a 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -104,7 +104,7 @@ static inline void nf_conntrack_timeout_fini(void) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct ctnl_timeout *(*nf_ct_timeout_find_get_hook)(const char *name); +extern struct ctnl_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); extern void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout); #endif -- cgit v1.2.3 From 4ec8ff0edccffe7a77f18e2a1e2ce86f03e08b5c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:54 -0500 Subject: netfilter: prepare xt_cgroup for multi revisions xt_cgroup will grow cgroup2 path based match. Postfix existing symbols with _v0 and prepare for multi revision registration. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Cc: Jan Engelhardt Cc: Pablo Neira Ayuso Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h index 43acb7e175f6..577c9e0b9406 100644 --- a/include/uapi/linux/netfilter/xt_cgroup.h +++ b/include/uapi/linux/netfilter/xt_cgroup.h @@ -3,7 +3,7 @@ #include -struct xt_cgroup_info { +struct xt_cgroup_info_v0 { __u32 id; __u32 invert; }; -- cgit v1.2.3 From c38c4597e4bf3e99860eac98211748e1ecb0e139 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:55 -0500 Subject: netfilter: implement xt_cgroup cgroup2 path match This patch implements xt_cgroup path match which matches cgroup2 membership of the associated socket. The match is recursive and invertible. For rationales on introducing another cgroup based match, please refer to a preceding commit "sock, cgroup: add sock->sk_cgroup". v3: Folded into xt_cgroup as a new revision interface as suggested by Pablo. v2: Included linux/limits.h from xt_cgroup2.h for PATH_MAX. Added explicit alignment to the priv field. Both suggested by Jan. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Cc: Jan Engelhardt Cc: Pablo Neira Ayuso Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_cgroup.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h index 577c9e0b9406..1e4b37b93bef 100644 --- a/include/uapi/linux/netfilter/xt_cgroup.h +++ b/include/uapi/linux/netfilter/xt_cgroup.h @@ -2,10 +2,23 @@ #define _UAPI_XT_CGROUP_H #include +#include struct xt_cgroup_info_v0 { __u32 id; __u32 invert; }; +struct xt_cgroup_info_v1 { + __u8 has_path; + __u8 has_classid; + __u8 invert_path; + __u8 invert_classid; + char path[PATH_MAX]; + __u32 classid; + + /* kernel internal data */ + void *priv __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_CGROUP_H */ -- cgit v1.2.3 From bda13fed677bdb423b97dcf054f68b9eb4c6dbfb Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 13 Dec 2015 16:53:02 +0900 Subject: net: Fix typo in skb_fclone_busy This patch fix a typo found within comment of skb_fclone_busy. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b9b9ead7bb3..af4f6ac025b6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -833,7 +833,7 @@ struct sk_buff_fclones { * skb_fclone_busy - check if fclone is busy * @skb: buffer * - * Returns true is skb is a fast clone, and its clone is not freed. + * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that this didnt happen. */ -- cgit v1.2.3 From 6ff64f6f9242d7e50f3e99cb280f69d1927a5fa6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Dec 2015 16:03:35 +0100 Subject: switchdev: Pass original device to port netdev driver switchdev drivers need to know the netdev on which the switchdev op was invoked. For example, the STP state of a VLAN interface configured on top of a port can change while being member in a bridge. In this case, the underlying driver should only change the STP state of that particular VLAN and not of all the VLANs configured on the port. However, current switchdev infrastructure only passes the port netdev down to the driver. Solve that by passing the original device down to the driver as part of the required switchdev object / attribute. This doesn't entail any change in current switchdev drivers. It simply enables those supporting stacked devices to know the originating device and act accordingly. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 1d22ce9f352e..6612946167fe 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -50,6 +50,7 @@ enum switchdev_attr_id { }; struct switchdev_attr { + struct net_device *orig_dev; enum switchdev_attr_id id; u32 flags; union { @@ -68,6 +69,7 @@ enum switchdev_obj_id { }; struct switchdev_obj { + struct net_device *orig_dev; enum switchdev_obj_id id; u32 flags; }; -- cgit v1.2.3 From 55dc5a9f2f2afd32d7b1bda44a5fc95e67a3371f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:40 -0800 Subject: net: Add skb_inner_transport_offset function Same thing as skb_transport_offset but returns the offset of the inner transport header (when skb->encpasulation is set). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af4f6ac025b6..2393373c9d08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1939,6 +1939,11 @@ static inline unsigned char *skb_inner_transport_header(const struct sk_buff return skb->head + skb->inner_transport_header; } +static inline int skb_inner_transport_offset(const struct sk_buff *skb) +{ + return skb_inner_transport_header(skb) - skb->data; +} + static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { skb->inner_transport_header = skb->data - skb->head; -- cgit v1.2.3 From 53692b1de419c1b59106909c7f6b4dd3dbc768ac Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:41 -0800 Subject: sctp: Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC The SCTP checksum is really a CRC and is very different from the standards 1's complement checksum that serves as the checksum for IP protocols. This offload interface is also very different. Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC to highlight these differences. The term CSUM should be reserved in the stack to refer to the standard 1's complement IP checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index f0d87347df19..6395f8309393 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -52,7 +52,7 @@ enum { NETIF_F_GSO_TUNNEL_REMCSUM_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ - NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ + NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ NETIF_F_FCOE_MTU_BIT, /* Supports max FCoE MTU, 2158 bytes*/ NETIF_F_NTUPLE_BIT, /* N-tuple filters supported */ NETIF_F_RXHASH_BIT, /* Receive hashing offload */ @@ -103,7 +103,7 @@ enum { #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) #define NETIF_F_RXHASH __NETIF_F(RXHASH) -#define NETIF_F_SCTP_CSUM __NETIF_F(SCTP_CSUM) +#define NETIF_F_SCTP_CRC __NETIF_F(SCTP_CRC) #define NETIF_F_SG __NETIF_F(SG) #define NETIF_F_TSO6 __NETIF_F(TSO6) #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) -- cgit v1.2.3 From a188222b6ed29404ac2d4232d35d1fe0e77af370 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:43 -0800 Subject: net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK The name NETIF_F_ALL_CSUM is a misnomer. This does not correspond to the set of features for offloading all checksums. This is a mask of the checksum offload related features bits. It is incorrect to set both NETIF_F_HW_CSUM and NETIF_F_IP_CSUM or NETIF_F_IPV6 at the same time for features of a device. This patch: - Changes instances of NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK (where NETIF_F_ALL_CSUM is being used as a mask). - Changes bonding, sfc/efx, ipvlan, macvlan, vlan, and team drivers to use NEITF_F_HW_CSUM in features list instead of NETIF_F_ALL_CSUM. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 7 ++++++- include/linux/netdevice.h | 6 +++--- include/net/vxlan.h | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 6395f8309393..2c4e94ab88da 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -149,7 +149,12 @@ enum { #define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM #define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) #define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) -#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) + +/* List of IP checksum features. Note that NETIF_HW_CSUM should not be + * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- + * this would be contradictory + */ +#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1bb21ff0fa64..a54223a113b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3763,12 +3763,12 @@ static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); f1 &= f2; if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); return f1; } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index c1c899c3a51b..b5a1aec1a167 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -232,7 +232,7 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) - return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK); + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } -- cgit v1.2.3 From c8cd0989bd151fda87bbf10887b3df18021284bc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:44 -0800 Subject: net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM These netif flags are unnecessary convolutions. It is more straightforward to just use NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, and NETIF_F_IPV6_CSUM directly. This patch also: - Cleans up can_checksum_protocol - Simplifies netdev_intersect_features Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdev_features.h | 9 +++------ include/linux/netdevice.h | 40 +++++++++++++++++++++++++--------------- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 05f5879821b8..a5f6ce6b578c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -621,7 +621,7 @@ static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c4e94ab88da..d9654f0eecb3 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -146,15 +146,12 @@ enum { #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ NETIF_F_TSO6 | NETIF_F_UFO) -#define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM -#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) -#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) - -/* List of IP checksum features. Note that NETIF_HW_CSUM should not be +/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ -#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) +#define NETIF_F_CSUM_MASK (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ + NETIF_F_HW_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a54223a113b1..283984b67cd9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3691,13 +3691,24 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); + if (protocol == htons(ETH_P_FCOE)) + return !!(features & NETIF_F_FCOE_CRC); + + /* Assume this is an IP checksum (not SCTP CRC) */ + + if (features & NETIF_F_HW_CSUM) { + /* Can checksum everything */ + return true; + } + + switch (protocol) { + case htons(ETH_P_IP): + return !!(features & NETIF_F_IP_CSUM); + case htons(ETH_P_IPV6): + return !!(features & NETIF_F_IPV6_CSUM); + default: + return false; + } } #ifdef CONFIG_BUG @@ -3762,15 +3773,14 @@ void linkwatch_run_queue(void); static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { - if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - f1 &= f2; - if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); + if ((f1 ^ f2) & NETIF_F_HW_CSUM) { + if (f1 & NETIF_F_HW_CSUM) + f1 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + else + f2 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + } - return f1; + return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( -- cgit v1.2.3 From 9a49850d0af7b9fd14d091dfe61ef6cb369f86b9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:45 -0800 Subject: tcp: Fix conditions to determine checksum offload In tcp_send_sendpage and tcp_sendmsg we check the route capabilities to determine if checksum offload can be performed. This check currently does not take the IP protocol into account for devices that advertise only one of NETIF_F_IPV6_CSUM or NETIF_F_IP_CSUM. This patch adds a function to check capabilities for checksum offload with a socket called sk_check_csum_caps. This function checks for specific IPv4 or IPv6 offload support based on the family of the socket. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 0ca22b014de1..ab0269f4b2cc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1791,6 +1791,15 @@ static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags) sk->sk_route_caps &= ~flags; } +static inline bool sk_check_csum_caps(struct sock *sk) +{ + return (sk->sk_route_caps & NETIF_F_HW_CSUM) || + (sk->sk_family == PF_INET && + (sk->sk_route_caps & NETIF_F_IP_CSUM)) || + (sk->sk_family == PF_INET6 && + (sk->sk_route_caps & NETIF_F_IPV6_CSUM)); +} + static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) -- cgit v1.2.3 From 6ae23ad36253a8033c5714c52b691b84456487c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:46 -0800 Subject: net: Add driver helper functions to determine checksum offloadability Add skb_csum_offload_chk driver helper function to determine if a device with limited checksum offload capabilities is able to offload the checksum for a given packet. This patch includes: - The skb_csum_offload_chk function. Returns true if checksum is offloadable, else false. Optionally, in the case that the checksum is not offloable, the function can call skb_checksum_help to resolve the checksum. skb_csum_offload_chk also returns whether the checksum refers to an encapsulated checksum. - Definition of skb_csum_offl_spec structure that caller uses to indicate rules about what it can offload (e.g. IPv4/v6, TCP/UDP only, whether encapsulated checksums can be offloaded, whether checksum with IPv6 extension headers can be offloaded). - Ancilary functions called skb_csum_offload_chk_help, skb_csum_off_chk_help_cmn, skb_csum_off_chk_help_cmn_v4_only. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 283984b67cd9..9fb6395967de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2522,6 +2522,71 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, remcsum_unadjust((__sum16 *)ptr, grc->delta); } +struct skb_csum_offl_spec { + __u16 ipv4_okay:1, + ipv6_okay:1, + encap_okay:1, + ip_options_okay:1, + ext_hdrs_okay:1, + tcp_okay:1, + udp_okay:1, + sctp_okay:1, + vlan_okay:1, + no_encapped_ipv6:1, + no_not_encapped:1; +}; + +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help); + +static inline bool skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return false; + + return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help); +} + +static inline bool skb_csum_offload_chk_help(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec) +{ + bool csum_encapped; + + return skb_csum_offload_chk(skb, spec, &csum_encapped, true); +} + +static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .ipv6_okay = 1, + .vlan_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + +static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + .vlan_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -3711,6 +3776,19 @@ static inline bool can_checksum_protocol(netdev_features_t features, } } +/* Map an ethertype into IP protocol if possible */ +static inline int eproto_to_ipproto(int eproto) +{ + switch (eproto) { + case htons(ETH_P_IP): + return IPPROTO_IP; + case htons(ETH_P_IPV6): + return IPPROTO_IPV6; + default: + return -1; + } +} + #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev); #else -- cgit v1.2.3 From 7a6ae71b2490586ed55105893a18dfc648e5fcbb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:47 -0800 Subject: net: Elaborate on checksum offload interface description Add specifics and details the description of the interface between the stack and drivers for doing checksum offload. This description is meant to be as specific and complete as possible. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 138 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2393373c9d08..6b6bd42d6134 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -39,11 +39,55 @@ #include #include -/* A. Checksumming of received packets by device. +/* The interface for checksum offload between the stack and networking drivers + * is as follows... + * + * A. IP checksum related features + * + * Drivers advertise checksum offload capabilities in the features of a device. + * From the stack's point of view these are capabilities offered by the driver, + * a driver typically only advertises features that it is capable of offloading + * to its device. + * + * The checksum related features are: + * + * NETIF_F_HW_CSUM - The driver (or its device) is able to compute one + * IP (one's complement) checksum for any combination + * of protocols or protocol layering. The checksum is + * computed and set in a packet per the CHECKSUM_PARTIAL + * interface (see below). + * + * NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv4. These are specifically + * unencapsulated packets of the form IPv4|TCP or + * IPv4|UDP where the Protocol field in the IPv4 header + * is TCP or UDP. The IPv4 header may contain IP options + * This feature cannot be set in features for a device + * with NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv6. These are specifically + * unencapsulated packets of the form IPv6|TCP or + * IPv4|UDP where the Next Header field in the IPv6 + * header is either TCP or UDP. IPv6 extension headers + * are not supported with this feature. This feature + * cannot be set in features for a device with + * NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. + * This flag is used only used to disable the RX checksum + * feature for a device. The stack will accept receive + * checksum indication in packets received on a device + * regardless of whether NETIF_F_RXCSUM is set. + * + * B. Checksumming of received packets by device. Indication of checksum + * verification is in set skb->ip_summed. Possible values are: * * CHECKSUM_NONE: * - * Device failed to checksum this packet e.g. due to lack of capabilities. + * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * @@ -53,9 +97,8 @@ * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY * if their checksums are okay. skb->csum is still undefined in this case - * though. It is a bad option, but, unfortunately, nowadays most vendors do - * this. Apparently with the secret goal to sell you new devices, when you - * will add new protocol to your host, f.e. IPv6 8) + * though. A driver or device must never modify the checksum field in the + * packet even if checksum is verified. * * CHECKSUM_UNNECESSARY is applicable to following protocols: * TCP: IPv6 and IPv4. @@ -96,40 +139,77 @@ * packet that are after the checksum being offloaded are not considered to * be verified. * - * B. Checksumming on output. - * - * CHECKSUM_NONE: - * - * The skb was already checksummed by the protocol, or a checksum is not - * required. + * C. Checksumming on transmit for non-GSO. The stack requests checksum offload + * in the skb->ip_summed for a packet. Values are: * * CHECKSUM_PARTIAL: * - * The device is required to checksum the packet as seen by hard_start_xmit() + * The driver is required to checksum the packet as seen by hard_start_xmit() * from skb->csum_start up to the end, and to record/write the checksum at - * offset skb->csum_start + skb->csum_offset. + * offset skb->csum_start + skb->csum_offset. A driver may verify that the + * csum_start and csum_offset values are valid values given the length and + * offset of the packet, however they should not attempt to validate that the + * checksum refers to a legitimate transport layer checksum-- it is the + * purview of the stack to validate that csum_start and csum_offset are set + * correctly. + * + * When the stack requests checksum offload for a packet, the driver MUST + * ensure that the checksum is set correctly. A driver can either offload the + * checksum calculation to the device, or call skb_checksum_help (in the case + * that the device does not support offload for a particular checksum). + * + * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of + * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate + * checksum offload capability. If a device has limited checksum capabilities + * (for instance can only perform NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM as + * described above) a helper function can be called to resolve + * CHECKSUM_PARTIAL. The helper functions are skb_csum_off_chk*. The helper + * function takes a spec argument that describes the protocol layer that is + * supported for checksum offload and can be called for each packet. If a + * packet does not match the specification for offload, skb_checksum_help + * is called to resolve the checksum. * - * The device must show its capabilities in dev->features, set up at device - * setup time, e.g. netdev_features.h: + * CHECKSUM_NONE: * - * NETIF_F_HW_CSUM - It's a clever device, it's able to checksum everything. - * NETIF_F_IP_CSUM - Device is dumb, it's able to checksum only TCP/UDP over - * IPv4. Sigh. Vendors like this way for an unknown reason. - * Though, see comment above about CHECKSUM_UNNECESSARY. 8) - * NETIF_F_IPV6_CSUM - About as dumb as the last one but does IPv6 instead. - * NETIF_F_... - Well, you get the picture. + * The skb was already checksummed by the protocol, or a checksum is not + * required. * * CHECKSUM_UNNECESSARY: * - * Normally, the device will do per protocol specific checksumming. Protocol - * implementations that do not want the NIC to perform the checksum - * calculation should use this flag in their outgoing skbs. + * This has the same meaning on as CHECKSUM_NONE for checksum offload on + * output. * - * NETIF_F_FCOE_CRC - This indicates that the device can do FCoE FC CRC - * offload. Correspondingly, the FCoE protocol driver - * stack should use CHECKSUM_UNNECESSARY. - * - * Any questions? No questions, good. --ANK + * CHECKSUM_COMPLETE: + * Not used in checksum output. If a driver observes a packet with this value + * set in skbuff, if should treat as CHECKSUM_NONE being set. + * + * D. Non-IP checksum (CRC) offloads + * + * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of + * offloading the SCTP CRC in a packet. To perform this offload the stack + * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note the there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports + * both IP checksum offload and SCTP CRC offload must verify which offload + * is configured for a packet presumably by inspecting packet headers. + * + * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of + * offloading the FCOE CRC in a packet. To perform this offload the stack + * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note the there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports + * both IP checksum offload and FCOE CRC offload must verify which offload + * is configured for a packet presumably by inspecting packet headers. + * + * E. Checksumming on output with GSO. + * + * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload + * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the + * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as + * part of the GSO operation is implied. If a checksum is being offloaded + * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset + * are set to refer to the outermost checksum being offload (two offloaded + * checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ -- cgit v1.2.3 From 3502cad73c4bbf8f6365d539e814159275252c59 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:36 -0800 Subject: rhashtable: add function to replace an element Add the rhashtable_replace_fast function. This replaces one object in the table with another atomically. The hashes of the new and old objects must be equal. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 843ceca9a21e..77deece15fb3 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -819,4 +819,86 @@ out: return err; } +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_head __rcu **pprev; + struct rhash_head *he; + spinlock_t *lock; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + lock = rht_bucket_lock(tbl, hash); + + spin_lock_bh(lock); + + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + rcu_assign_pointer(*pprev, obj_new); + err = 0; + break; + } + + spin_unlock_bh(lock); + + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} + #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From fc9e50f5a5a4e1fa9ba2756f745a13e693cf6a06 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:37 -0800 Subject: netlink: add a start callback for starting a netlink dump The start callback allows the caller to set up a context for the dump callbacks. Presumably, the context can then be destroyed in the done callback. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ include/net/genetlink.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 639e9b8b0e4d..0b41959aab9f 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -131,6 +131,7 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; + int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); @@ -153,6 +154,7 @@ struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { + int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); void *data; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 1b6b6dcb018d..43c0e771f417 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -114,6 +114,7 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) * @flags: flags * @policy: attribute validation policy * @doit: standard command callback + * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps * @ops_list: operations list @@ -122,6 +123,7 @@ struct genl_ops { const struct nla_policy *policy; int (*doit)(struct sk_buff *skb, struct genl_info *info); + int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); -- cgit v1.2.3 From 7f00feaf107645d95a6d87e99b4d141ac0a08efd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:38 -0800 Subject: ila: Add generic ILA translation facility This patch implements an ILA tanslation table. This table can be configured with identifier to locator mappings, and can be be queried to resolve a mapping. Queries can be parameterized based on interface, direction (incoming or outoing), and matching locator. The table is implemented using rhashtable and is configured via netlink (through "ip ila .." in iproute). The table may be used as alternative means to do do ILA tanslations other than the lw tunnels Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ila.h | 18 ++++++++++++++++++ include/uapi/linux/ila.h | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 include/net/ila.h (limited to 'include') diff --git a/include/net/ila.h b/include/net/ila.h new file mode 100644 index 000000000000..9f4f43e94ae4 --- /dev/null +++ b/include/net/ila.h @@ -0,0 +1,18 @@ +/* + * ILA kernel interface + * + * Copyright (c) 2015 Tom Herbert + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + */ + +#ifndef _NET_ILA_H +#define _NET_ILA_H + +int ila_xlat_outgoing(struct sk_buff *skb); +int ila_xlat_incoming(struct sk_buff *skb); + +#endif /* _NET_ILA_H */ diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h index 7ed9e670814e..abde7bbd6f3b 100644 --- a/include/uapi/linux/ila.h +++ b/include/uapi/linux/ila.h @@ -3,13 +3,35 @@ #ifndef _UAPI_LINUX_ILA_H #define _UAPI_LINUX_ILA_H +/* NETLINK_GENERIC related info */ +#define ILA_GENL_NAME "ila" +#define ILA_GENL_VERSION 0x1 + enum { ILA_ATTR_UNSPEC, ILA_ATTR_LOCATOR, /* u64 */ + ILA_ATTR_IDENTIFIER, /* u64 */ + ILA_ATTR_LOCATOR_MATCH, /* u64 */ + ILA_ATTR_IFINDEX, /* s32 */ + ILA_ATTR_DIR, /* u32 */ __ILA_ATTR_MAX, }; #define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) +enum { + ILA_CMD_UNSPEC, + ILA_CMD_ADD, + ILA_CMD_DEL, + ILA_CMD_GET, + + __ILA_CMD_MAX, +}; + +#define ILA_CMD_MAX (__ILA_CMD_MAX - 1) + +#define ILA_DIR_IN (1 << 0) +#define ILA_DIR_OUT (1 << 1) + #endif /* _UAPI_LINUX_ILA_H */ -- cgit v1.2.3 From b613f56ec9baf30edf5d9d607b822532a273dad7 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:02 +0900 Subject: net: diag: split inet_diag_dump_one_icsk into two Currently, inet_diag_dump_one_icsk finds a socket and then dumps its information to userspace. Split it into a part that finds the socket and a part that dumps the information. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0e707f0c1a3e..e7032f041982 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -3,6 +3,7 @@ #include +struct net; struct sock; struct inet_hashinfo; struct nlattr; @@ -41,6 +42,10 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, const struct nlmsghdr *nlh, const struct inet_diag_req_v2 *req); +struct sock *inet_diag_find_one_icsk(struct net *net, + struct inet_hashinfo *hashinfo, + const struct inet_diag_req_v2 *req); + int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); extern int inet_diag_register(const struct inet_diag_handler *handler); -- cgit v1.2.3 From 64be0aed59ad519d6f2160868734f7e278290ac1 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:03 +0900 Subject: net: diag: Add the ability to destroy a socket. This patch adds a SOCK_DESTROY operation, a destroy function pointer to sock_diag_handler, and a diag_destroy function pointer. It does not include any implementation code. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 2 ++ include/net/sock.h | 1 + include/uapi/linux/sock_diag.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index fddebc617469..4018b48f2b3b 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -15,6 +15,7 @@ struct sock_diag_handler { __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); int (*get_info)(struct sk_buff *skb, struct sock *sk); + int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh); }; int sock_diag_register(const struct sock_diag_handler *h); @@ -68,4 +69,5 @@ bool sock_diag_has_destroy_listeners(const struct sock *sk) } void sock_diag_broadcast_destroy(struct sock *sk); +int sock_diag_destroy(struct sock *sk, int err); #endif diff --git a/include/net/sock.h b/include/net/sock.h index ab0269f4b2cc..6e6e8a25d997 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1060,6 +1060,7 @@ struct proto { void (*destroy_cgroup)(struct mem_cgroup *memcg); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif + int (*diag_destroy)(struct sock *sk, int err); }; int proto_register(struct proto *prot, int alloc_slab); diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index 49230d36f9ce..bae2d80034d4 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -4,6 +4,7 @@ #include #define SOCK_DIAG_BY_FAMILY 20 +#define SOCK_DESTROY 21 struct sock_diag_req { __u8 sdiag_family; -- cgit v1.2.3 From 6eb5d2e08f071c05ecbe135369c9ad418826cab2 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:04 +0900 Subject: net: diag: Support SOCK_DESTROY for inet sockets. This passes the SOCK_DESTROY operation to the underlying protocol diag handler, or returns -EOPNOTSUPP if that handler does not define a destroy operation. Most of this patch is just renaming functions. This is not strictly necessary, but it would be fairly counterintuitive to have the code to destroy inet sockets be in a function whose name starts with inet_diag_get. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index e7032f041982..7c27fa1030e8 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,6 +24,10 @@ struct inet_diag_handler { void (*idiag_get_info)(struct sock *sk, struct inet_diag_msg *r, void *info); + + int (*destroy)(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req); + __u16 idiag_type; __u16 idiag_info_size; }; -- cgit v1.2.3 From c1e64e298b8cad309091b95d8436a0255c84f54a Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:05 +0900 Subject: net: diag: Support destroying TCP sockets. This implements SOCK_DESTROY for TCP sockets. It causes all blocking calls on the socket to fail fast with ECONNABORTED and causes a protocol close of the socket. It informs the other end of the connection by sending a RST, i.e., initiating a TCP ABORT as per RFC 793. ECONNABORTED was chosen for consistency with FreeBSD. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f80e74c5ad18..3077735b348d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1170,6 +1170,8 @@ void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); +int tcp_abort(struct sock *sk, int err); + static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; -- cgit v1.2.3 From 541c9a84cd85203244307d9ebb821102eed82789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 9 Dec 2015 23:36:51 +0100 Subject: ssb: pick SoC invariants code from MIPS BCM47xx arch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is code in ssb fetching "invariants" that is basically a set of board specific data. Every host requires its own implementation of reading function. In ssb we have support for PCI, PCMCIA & SDIO. For some (historical?) reason code reading "invariants" for SoC was placed in arch code and provided by a callback. This is not needed nowadays, so lets move that into ssb. This way we keep all "invariants" functions in a single module making code cleaner. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/ssb/ssb.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index c3d1a525bacc..26a0b3c3ce5f 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -524,13 +524,9 @@ struct ssb_init_invariants { typedef int (*ssb_invariants_func_t)(struct ssb_bus *bus, struct ssb_init_invariants *iv); -/* Register a SSB system bus. get_invariants() is called after the - * basic system devices are initialized. - * The invariants are usually fetched from some NVRAM. - * Put the invariants into the struct pointed to by iv. */ -extern int ssb_bus_ssbbus_register(struct ssb_bus *bus, - unsigned long baseaddr, - ssb_invariants_func_t get_invariants); +/* Register SoC bus. */ +extern int ssb_bus_host_soc_register(struct ssb_bus *bus, + unsigned long baseaddr); #ifdef CONFIG_SSB_PCIHOST extern int ssb_bus_pcibus_register(struct ssb_bus *bus, struct pci_dev *host_pci); -- cgit v1.2.3 From 566178f853c1aa57be9c16007c7cca07df5d51b6 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Wed, 16 Dec 2015 13:55:04 +0800 Subject: net: sctp: dynamically enable or disable pf state As we all know, the value of pf_retrans >= max_retrans_path can disable pf state. The variables of pf_retrans and max_retrans_path can be changed by the userspace application. Sometimes the user expects to disable pf state while the 2 variables are changed to enable pf state. So it is necessary to introduce a new variable to disable pf state. According to the suggestions from Vlad Yasevich, extra1 and extra2 are removed. The initialization of pf_enable is added. Acked-by: Vlad Yasevich Signed-off-by: Zhu Yanjun Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index 8ba379f9e467..c501d67172b1 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -88,6 +88,13 @@ struct netns_sctp { */ int pf_retrans; + /* + * Disable Potentially-Failed feature, the feature is enabled by default + * pf_enable - 0 : disable pf + * - >0 : enable pf + */ + int pf_enable; + /* * Policy for preforming sctp/socket accounting * 0 - do socket level accounting, all assocs share sk_sndbuf -- cgit v1.2.3 From a8170d2b9e8d38a1f3fa3b40b6f8cd34a87d5382 Mon Sep 17 00:00:00 2001 From: "Singhai, Anjali" Date: Mon, 14 Dec 2015 12:21:17 -0800 Subject: geneve: Add geneve udp port offload for ethernet devices Add ndo_ops to add/del UDP ports to a device that supports geneve offload. v2: Comment fix. Signed-off-by: Anjali Singhai Jain Signed-off-by: Kiran Patil Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9fb6395967de..81b26a543a3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1013,6 +1013,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * a new port starts listening. The operation is protected by the * vxlan_net->sock_lock. * + * void (*ndo_add_geneve_port)(struct net_device *dev, + * sa_family_t sa_family, __be16 port); + * Called by geneve to notify a driver about the UDP port and socket + * address family that geneve is listnening to. It is called only when + * a new port starts listening. The operation is protected by the + * geneve_net->sock_lock. + * + * void (*ndo_del_geneve_port)(struct net_device *dev, + * sa_family_t sa_family, __be16 port); + * Called by geneve to notify the driver about a UDP port and socket + * address family that geneve is not listening to anymore. The operation + * is protected by the geneve_net->sock_lock. + * * void (*ndo_del_vxlan_port)(struct net_device *dev, * sa_family_t sa_family, __be16 port); * Called by vxlan to notify the driver about a UDP port and socket @@ -1217,7 +1230,12 @@ struct net_device_ops { void (*ndo_del_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); - + void (*ndo_add_geneve_port)(struct net_device *dev, + sa_family_t sa_family, + __be16 port); + void (*ndo_del_geneve_port)(struct net_device *dev, + sa_family_t sa_family, + __be16 port); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, -- cgit v1.2.3 From 05ca4029b25c65c860be5baa9288ec8f7e0a97e6 Mon Sep 17 00:00:00 2001 From: "Singhai, Anjali" Date: Mon, 14 Dec 2015 12:21:20 -0800 Subject: geneve: Add geneve_get_rx_port support This patch adds an op that the drivers can call into to get existing geneve ports. Signed-off-by: Anjali Singhai Jain Signed-off-by: David S. Miller --- include/net/geneve.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/geneve.h b/include/net/geneve.h index 3106ed6eae0d..e6c23dc765f7 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -62,6 +62,14 @@ struct genevehdr { struct geneve_opt options[]; }; +#if IS_ENABLED(CONFIG_GENEVE) +void geneve_get_rx_port(struct net_device *netdev); +#else +static inline void geneve_get_rx_port(struct net_device *netdev) +{ +} +#endif + #ifdef CONFIG_INET struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port); -- cgit v1.2.3 From 32bc201e1974976b7d3fea9a9b17bb7392ca6394 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Dec 2015 17:50:11 +0800 Subject: ipv6: allow routes to be configured with expire values Add the support for adding expire value to routes, requested by Tom Gundersen for systemd-networkd, and NetworkManager wants it too. implement it by adding the new RTNETLINK attribute RTA_EXPIRES. Signed-off-by: Xin Long Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 123a5af4e8bb..ca764b5da86d 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -311,6 +311,7 @@ enum rtattr_type_t { RTA_PREF, RTA_ENCAP_TYPE, RTA_ENCAP, + RTA_EXPIRES, __RTA_MAX }; -- cgit v1.2.3 From 715f504b118998c41a2079a17e16bf5a8a114885 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 16 Dec 2015 17:22:47 +0100 Subject: ipv6: add IPV6_HDRINCL option for raw sockets Same as in Windows, we miss IPV6_HDRINCL for SOL_IPV6 and SOL_RAW. The SOL_IP/IP_HDRINCL is not available for IPv6 sockets. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 79b12b004ade..318a4828bf98 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -196,6 +196,7 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 +#define IPV6_HDRINCL 36 #endif /* -- cgit v1.2.3 From cc9da6cc4f56e05cc9e591459fe0192727ff58b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 16 Dec 2015 16:44:38 +0100 Subject: ipv6: addrconf: use stable address generator for ARPHRD_NONE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new address generator mode, using the stable address generator with an automatically generated secret. This is intended as a default address generator mode for device types with no EUI64 implementation. The new generator is used for ARPHRD_NONE interfaces initially, adding default IPv6 autoconf support to e.g. tun interfaces. If the addrgenmode is set to 'random', either by default or manually, and no stable secret is available, then a random secret is used as input for the stable-privacy address generator. The secret can be read and modified like manually configured secrets, using the proc interface. Modifying the secret will change the addrgen mode to 'stable-privacy' to indicate that it operates on a known secret. Existing behaviour of the 'stable-privacy' mode is kept unchanged. If a known secret is available when the device is created, then the mode will default to 'stable-privacy' as before. The mode can be manually set to 'random' but it will behave exactly like 'stable-privacy' in this case. The secret will not change. Cc: Hannes Frederic Sowa Cc: 吉藤英明 Signed-off-by: Bjørn Mork Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2be1dd5a103f..a30b78090594 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -218,6 +218,7 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_EUI64, IN6_ADDR_GEN_MODE_NONE, IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, }; /* Bridge section */ -- cgit v1.2.3 From 1a8524794fc7c70f44ac28e3a6e8fd637bc41f14 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 16 Dec 2015 13:20:43 -0800 Subject: net: l3mdev: Add master device lookup by index Add helper to lookup l3mdev master index given a device index. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 774d85b2d5d9..786226f8e77b 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -51,6 +51,24 @@ static inline int l3mdev_master_ifindex(struct net_device *dev) return ifindex; } +static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + int rc = 0; + + if (likely(ifindex)) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + rc = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + } + + return rc; +} + /* get index of an interface to use for FIB lookups. For devices * enslaved to an L3 master device FIB lookups are based on the * master index @@ -167,6 +185,11 @@ static inline int l3mdev_master_ifindex(struct net_device *dev) return 0; } +static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) +{ + return 0; +} + static inline int l3mdev_fib_oif_rcu(struct net_device *dev) { return dev ? dev->ifindex : 0; -- cgit v1.2.3 From 6dd9a14e92e54895e143f10fef4d0b9abe109aa9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 16 Dec 2015 13:20:44 -0800 Subject: net: Allow accepted sockets to be bound to l3mdev domain Allow accepted sockets to derive their sk_bound_dev_if setting from the l3mdev domain in which the packets originated. A sysctl setting is added to control the behavior which is similar to sk_mark and sysctl_tcp_fwmark_accept. This effectively allow a process to have a "VRF-global" listen socket, with child sockets bound to the VRF device in which the packet originated. A similar behavior can be achieved using sk_mark, but a solution using marks is incomplete as it does not handle duplicate addresses in different L3 domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev domain provides a complete solution. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inet_sock.h | 14 ++++++++++++++ include/net/netns/ipv4.h | 3 +++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 625bdf95d673..012b1f91f3ec 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -28,6 +28,7 @@ #include #include #include +#include /** struct ip_options - IP Options * @@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) return sk->sk_mark; } +static inline int inet_request_bound_dev_if(const struct sock *sk, + struct sk_buff *skb) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + struct net *net = sock_net(sk); + + if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) + return l3mdev_master_ifindex_by_index(net, skb->skb_iif); +#endif + + return sk->sk_bound_dev_if; +} + struct inet_cork { unsigned int flags; __be32 addr; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index c68926b4899c..d75be32650ba 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -86,6 +86,9 @@ struct netns_ipv4 { int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_tcp_l3mdev_accept; +#endif int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; int sysctl_tcp_probe_threshold; -- cgit v1.2.3 From 05c74e5e53f6cb07502c3e6a820f33e2777b6605 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Dec 2015 23:51:53 +0100 Subject: bpf: add bpf_skb_load_bytes helper When hacking tc programs with eBPF, one of the issues that come up from time to time is to load addresses from headers. In eBPF as in classic BPF, we have BPF_LD | BPF_ABS | BPF_{B,H,W} instructions that extract a byte, half-word or word out of the skb data though helpers such as bpf_load_pointer() (interpreter case). F.e. extracting a whole IPv6 address could possibly look like ... union v6addr { struct { __u32 p1; __u32 p2; __u32 p3; __u32 p4; }; __u8 addr[16]; }; [...] a.p1 = htonl(load_word(skb, off)); a.p2 = htonl(load_word(skb, off + 4)); a.p3 = htonl(load_word(skb, off + 8)); a.p4 = htonl(load_word(skb, off + 12)); [...] /* access to a.addr[...] */ This work adds a complementary helper bpf_skb_load_bytes() (we also have bpf_skb_store_bytes()) as an alternative where the same call would look like from an eBPF program: ret = bpf_skb_load_bytes(skb, off, addr, sizeof(addr)); Same verifier restrictions apply as in ffeedafbf023 ("bpf: introduce current->pid, tgid, uid, gid, comm accessors") case, where stack memory access needs to be statically verified and thus guaranteed to be initialized in first use (otherwise verifier cannot tell whether a subsequent access to it is valid or not as it's runtime dependent). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9ea2d22fa2cb..8bed7f1176b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -269,6 +269,7 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_perf_event_output, + BPF_FUNC_skb_load_bytes, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From b1f0a0e99c58fbd7ea053ca36ba623718272b618 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 21 Dec 2015 21:29:24 +0100 Subject: net: add inet_sk_transparent() helper Avoids cluttering tcp_v4_send_reset when followup patch extends it to deal with timewait sockets. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 +- include/net/tcp.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index a0dde04eb178..f49759decb28 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -68,7 +68,7 @@ struct request_sock { u32 peer_secid; }; -static inline struct request_sock *inet_reqsk(struct sock *sk) +static inline struct request_sock *inet_reqsk(const struct sock *sk) { return (struct request_sock *)sk; } diff --git a/include/net/tcp.h b/include/net/tcp.h index 3077735b348d..f33fecf4e282 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1620,6 +1620,18 @@ static inline void tcp_highest_sack_combine(struct sock *sk, tcp_sk(sk)->highest_sack = new; } +/* This helper checks if socket has IP_TRANSPARENT set */ +static inline bool inet_sk_transparent(const struct sock *sk) +{ + switch (sk->sk_state) { + case TCP_TIME_WAIT: + return inet_twsk(sk)->tw_transparent; + case TCP_NEW_SYN_RECV: + return inet_rsk(inet_reqsk(sk))->no_srccheck; + } + return inet_sk(sk)->transparent; +} + /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ -- cgit v1.2.3 From 039f50629b7f860f36644ed1f34b27da9aa62f43 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 24 Dec 2015 14:34:54 -0800 Subject: ip_tunnel: Move stats update to iptunnel_xmit() By moving stats update into iptunnel_xmit(), we can simplify iptunnel_xmit() usage. With this change there is no need to call another function (iptunnel_xmit_stats()) to update stats in tunnel xmit code path. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 17 ++++------------- include/net/ip_tunnels.h | 28 +++++++++++++++------------- include/net/udp_tunnel.h | 8 ++++---- 3 files changed, 23 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index ff788b665277..ae07e94778d8 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -5,6 +5,7 @@ #include #include #include +#include #define IP6TUNNEL_ERR_TIMEO (30*HZ) @@ -83,22 +84,12 @@ int ip6_tnl_get_iflink(const struct net_device *dev); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev) { - struct net_device_stats *stats = &dev->stats; int pkt_len, err; pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); - - if (net_xmit_eval(err) == 0) { - struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - put_cpu_ptr(tstats); - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } + if (unlikely(net_xmit_eval(err))) + pkt_len = -1; + iptunnel_xmit_stats(dev, pkt_len); } #endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 62a750a6a8f8..6db96ea0144f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -273,32 +273,34 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, } int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); -int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); +void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, u8 proto, + u8 tos, u8 ttl, __be16 df, bool xnet); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); -static inline void iptunnel_xmit_stats(int err, - struct net_device_stats *err_stats, - struct pcpu_sw_netstats __percpu *stats) +static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { - if (err > 0) { - struct pcpu_sw_netstats *tstats = get_cpu_ptr(stats); + if (pkt_len > 0) { + struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += err; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); - } else if (err < 0) { - err_stats->tx_errors++; - err_stats->tx_aborted_errors++; } else { - err_stats->tx_dropped++; + struct net_device_stats *err_stats = &dev->stats; + + if (pkt_len < 0) { + err_stats->tx_errors++; + err_stats->tx_aborted_errors++; + } else { + err_stats->tx_dropped++; + } } } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index cb2f89f20f5c..cca2ad3082c3 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -78,10 +78,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *sock_cfg); /* Transmit the skb using UDP encapsulation. */ -int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); +void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, + __be16 df, __be16 src_port, __be16 dst_port, + bool xnet, bool nocheck); #if IS_ENABLED(CONFIG_IPV6) int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, -- cgit v1.2.3 From df05ef874b284d833c2d9795a6350c6a373ab6c9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 19:39:32 +0100 Subject: netfilter: nf_tables: release objects on netns destruction We have to release the existing objects on netns removal otherwise we leak them. Chains are unregistered in first place to make sure no packets are walking on our rules and sets anymore. The object release happens by when we unregister the family via nft_release_afinfo() which is called from nft_unregister_afinfo() from the corresponding __net_exit path in every family. Reported-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b313cda49194..a50f139ce087 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -880,7 +880,7 @@ struct nft_af_info { }; int nft_register_afinfo(struct net *, struct nft_af_info *); -void nft_unregister_afinfo(struct nft_af_info *); +void nft_unregister_afinfo(struct net *, struct nft_af_info *); int nft_register_chain_type(const struct nf_chain_type *); void nft_unregister_chain_type(const struct nf_chain_type *); -- cgit v1.2.3 From 5ebe0b0eec9d6f703b137f9b938c52f7b91dd9d6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 19:40:49 +0100 Subject: netfilter: nf_tables: destroy basechain and rules on netdevice removal If the netdevice is destroyed, the resources that are attached should be released too as they belong to the device that is now gone. Suggested-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a50f139ce087..0191fbb33a2f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -821,10 +821,7 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai return container_of(chain, struct nft_base_chain, chain); } -int nft_register_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops); -void nft_unregister_basechain(struct nft_base_chain *basechain, - unsigned int hook_nops); +int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); -- cgit v1.2.3 From 7b8002a1511fcbcb0596cac90d67ad5c8182d0aa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 18:41:56 +0100 Subject: netfilter: nfnetlink: pass down netns pointer to call() and call_rcu() Adapt callsites to avoid recurrent lookup of the netns pointer. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 5646b24bfc64..ceacbf5dcb73 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -8,12 +8,12 @@ #include struct nfnl_callback { - int (*call)(struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]); - int (*call_rcu)(struct sock *nl, struct sk_buff *skb, + int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); + int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]); int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); -- cgit v1.2.3 From 5913beaf0d70f97135ed7191c028fd88b3848864 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 19:41:57 +0100 Subject: netfilter: nfnetlink: pass down netns pointer to commit() and abort() callbacks Adapt callsites to avoid recurrent lookup of the netns pointer. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ceacbf5dcb73..ba0d9789eb6e 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -26,8 +26,8 @@ struct nfnetlink_subsystem { __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ - int (*commit)(struct sk_buff *skb); - int (*abort)(struct sk_buff *skb); + int (*commit)(struct net *net, struct sk_buff *skb); + int (*abort)(struct net *net, struct sk_buff *skb); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); -- cgit v1.2.3 From 9afec6d3866b8451abcf1a7a1a381a3be6c83386 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 23 Dec 2015 23:45:18 +0100 Subject: nfc: netlink: HCI event connectivity implementation Add support for missing HCI event EVT_CONNECTIVITY and forward it to userspace. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index dcfcfc9c00bf..1a3de8b34ad2 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -299,6 +299,7 @@ void nfc_driver_failure(struct nfc_dev *dev, int err); int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction); +int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx); struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx); -- cgit v1.2.3 From 7be4fb643ef2d1058b897ba9dbe17bf5ced04391 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 23 Dec 2015 23:45:24 +0100 Subject: nfc: microread: Fix header comment microread platform_data header had an NXP header. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/microread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/microread.h b/include/linux/platform_data/microread.h index cfda59b226ee..ca13992089b8 100644 --- a/include/linux/platform_data/microread.h +++ b/include/linux/platform_data/microread.h @@ -1,5 +1,5 @@ /* - * Driver include for the PN544 NFC chip. + * Driver include for the Inside Secure microread NFC Chip. * * Copyright (C) 2011 Tieto Poland * Copyright (C) 2012 Intel Corporation. All rights reserved. -- cgit v1.2.3 From f3a4094558ddf8afa8bb58250d548e15e059c65a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 30 Dec 2015 16:28:25 +0100 Subject: ethtool: Add phy statistics Ethernet PHYs can maintain statistics, for example errors while idle and receive errors. Add an ethtool mechanism to retrieve these statistics, using the same model as MAC statistics. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 ++++++ include/uapi/linux/ethtool.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 05fde31b6dc6..a89cb0eef911 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -589,6 +589,12 @@ struct phy_driver { int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); + /* Get statistics from the phy using ethtool */ + int (*get_sset_count)(struct phy_device *dev); + void (*get_strings)(struct phy_device *dev, u8 *data); + void (*get_stats)(struct phy_device *dev, + struct ethtool_stats *stats, u64 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cd1629170103..57fa39005e79 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -542,6 +542,7 @@ struct ethtool_pauseparam { * now deprecated * @ETH_SS_FEATURES: Device feature names * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names + * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -551,6 +552,7 @@ enum ethtool_stringset { ETH_SS_FEATURES, ETH_SS_RSS_HASH_FUNCS, ETH_SS_TUNABLES, + ETH_SS_PHY_STATS, }; /** @@ -1225,6 +1227,7 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ #define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ #define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET -- cgit v1.2.3 From c7862a5f0de5f521c545f3436f0aa190964342dd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Dec 2015 18:21:44 +0100 Subject: netfilter: nft_limit: allow to invert matching criteria This patch allows you to invert the ratelimit matching criteria, so you can match packets over the ratelimit. This is required to support what hashlimit does. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b48a3ab761f8..22043ce95ae6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -780,6 +780,10 @@ enum nft_limit_type { NFT_LIMIT_PKT_BYTES }; +enum nft_limit_flags { + NFT_LIMIT_F_INV = (1 << 0), +}; + /** * enum nft_limit_attributes - nf_tables limit expression netlink attributes * @@ -787,6 +791,7 @@ enum nft_limit_type { * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) * @NFTA_LIMIT_BURST: burst (NLA_U32) * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type) + * @NFTA_LIMIT_FLAGS: flags (NLA_U32: enum nft_limit_flags) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, @@ -794,6 +799,7 @@ enum nft_limit_attributes { NFTA_LIMIT_UNIT, NFTA_LIMIT_BURST, NFTA_LIMIT_TYPE, + NFTA_LIMIT_FLAGS, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) -- cgit v1.2.3 From 502061f81d3eb4518d2e72178e494a8547788ad0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Jan 2016 21:02:18 +0100 Subject: netfilter: nf_tables: add packet duplication to the netdev family You can use this to duplicate packets and inject them at the egress path of the specified interface. This duplication allows you to inspect traffic from the dummy or any other interface dedicated to this purpose. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_dup_netdev.h | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 include/net/netfilter/nf_dup_netdev.h (limited to 'include') diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h new file mode 100644 index 000000000000..397dcae349f9 --- /dev/null +++ b/include/net/netfilter/nf_dup_netdev.h @@ -0,0 +1,6 @@ +#ifndef _NF_DUP_NETDEV_H_ +#define _NF_DUP_NETDEV_H_ + +void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); + +#endif -- cgit v1.2.3 From 39e6dea28adc874f7021e5580c13cab0b58407ea Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 25 Nov 2015 13:39:38 +0100 Subject: netfilter: nf_tables: add forward expression to the netdev family You can use this to forward packets from ingress to the egress path of the specified interface. This provides a fast path to bounce packets from one interface to another specific destination interface. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 22043ce95ae6..731288a039f6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -983,6 +983,18 @@ enum nft_dup_attributes { }; #define NFTA_DUP_MAX (__NFTA_DUP_MAX - 1) +/** + * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes + * + * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register) + */ +enum nft_fwd_attributes { + NFTA_FWD_UNSPEC, + NFTA_FWD_SREG_DEV, + __NFTA_FWD_MAX +}; +#define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * -- cgit v1.2.3 From 29663b0cc1d5b9b6e2f6caf41e86c599a0310def Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 23 Dec 2015 22:36:32 +0100 Subject: mac802154: constify ieee802154_llsec_ops structure The ieee802154_llsec_ops structure is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/ieee802154_netdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index a62a051a3a2f..c4b31601cd53 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -337,7 +337,7 @@ struct ieee802154_mlme_ops { void (*get_mac_params)(struct net_device *dev, struct ieee802154_mac_params *params); - struct ieee802154_llsec_ops *llsec; + const struct ieee802154_llsec_ops *llsec; }; static inline struct ieee802154_mlme_ops * -- cgit v1.2.3 From 888cc8c20cf265fcd1302f6c5d6be07628ba66c7 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 28 Dec 2015 02:07:08 +0300 Subject: sh_eth: remove EDMAC_BIG_ENDIAN Commit 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") added support for the big-endian EDMAC descriptors. However, it was never used and never worked right until the recent driver fixes. I think we now can just remove this support, it was only burdening the driver from the start. It should be easy to do without disturbing the SH platform code, at least for now... Signed-off-by: Sergei Shtylyov Acked-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/sh_eth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index 8c9131db2b25..f2e27e078362 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -4,7 +4,7 @@ #include #include -enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; +enum {EDMAC_LITTLE_ENDIAN}; struct sh_eth_plat_data { int phy; -- cgit v1.2.3 From 0efeff2905d0ea14f2ee83e6cefbda35ee8cf6fc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 1 Jan 2016 13:18:48 +0100 Subject: net: make ip6tunnel_xmit definition conditional Moving the caller of iptunnel_xmit_stats causes a build error in randconfig builds that disable CONFIG_INET: In file included from ../net/xfrm/xfrm_input.c:17:0: ../include/net/ip6_tunnel.h: In function 'ip6tunnel_xmit': ../include/net/ip6_tunnel.h:93:2: error: implicit declaration of function 'iptunnel_xmit_stats' [-Werror=implicit-function-declaration] iptunnel_xmit_stats(dev, pkt_len); The reason is that the iptunnel_xmit_stats definition is hidden inside #ifdef CONFIG_INET but the caller is not. We can change one or the other to fix it, and this patch adds a second #ifdef around ip6tunnel_xmit() to avoid seeing the invalid call. Signed-off-by: Arnd Bergmann Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()") Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index ae07e94778d8..0d0ce0b2d870 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -81,6 +81,7 @@ __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); +#ifdef CONFIG_INET static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev) { @@ -93,3 +94,4 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, iptunnel_xmit_stats(dev, pkt_len); } #endif +#endif -- cgit v1.2.3 From ef456144da8ef507c8cf504284b6042e9201a05c Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 4 Jan 2016 17:41:45 -0500 Subject: soreuseport: define reuseport groups struct sock_reuseport is an optional shared structure referenced by each socket belonging to a reuseport group. When a socket is bound to an address/port not yet in use and the reuseport flag has been set, the structure will be allocated and attached to the newly bound socket. When subsequent calls to bind are made for the same address/port, the shared structure will be updated to include the new socket and the newly bound socket will reference the group structure. Usually, when an incoming packet was destined for a reuseport group, all sockets in the same group needed to be considered before a dispatching decision was made. With this structure, an appropriate socket can be found after looking up just one socket in the group. This shared structure will also allow for more complicated decisions to be made when selecting a socket (eg a BPF filter). This work is based off a similar implementation written by Ying Cai for implementing policy-based reuseport selection. Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ include/net/sock_reuseport.h | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 include/net/sock_reuseport.h (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3794cdde837a..e830c1006935 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -318,6 +318,7 @@ struct cg_proto; * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 + * @sk_reuseport_cb: reuseport group container */ struct sock { /* @@ -453,6 +454,7 @@ struct sock { int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); + struct sock_reuseport __rcu *sk_reuseport_cb; }; #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h new file mode 100644 index 000000000000..67d1eb8fd7af --- /dev/null +++ b/include/net/sock_reuseport.h @@ -0,0 +1,20 @@ +#ifndef _SOCK_REUSEPORT_H +#define _SOCK_REUSEPORT_H + +#include +#include + +struct sock_reuseport { + struct rcu_head rcu; + + u16 max_socks; /* length of socks */ + u16 num_socks; /* elements in socks */ + struct sock *socks[0]; /* array of sock pointers */ +}; + +extern int reuseport_alloc(struct sock *sk); +extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2); +extern void reuseport_detach_sock(struct sock *sk); +extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash); + +#endif /* _SOCK_REUSEPORT_H */ -- cgit v1.2.3 From e32ea7e747271a0abcd37e265005e97cc81d9df5 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 4 Jan 2016 17:41:46 -0500 Subject: soreuseport: fast reuseport UDP socket selection Include a struct sock_reuseport instance when a UDP socket binds to a specific address for the first time with the reuseport flag set. When selecting a socket for an incoming UDP packet, use the information available in sock_reuseport if present. This required adding an additional field to the UDP source address equality function to differentiate between exact and wildcard matches. The original use case allowed wildcard matches when checking for existing port uses during bind. The new use case of adding a socket to a reuseport group requires exact address matching. Performance test (using a machine with 2 CPU sockets and a total of 48 cores): Create reuseport groups of varying size. Use one socket from this group per user thread (pinning each thread to a different core) calling recvmmsg in a tight loop. Record number of messages received per second while saturating a 10G link. 10 sockets: 18% increase (~2.8M -> 3.3M pkts/s) 20 sockets: 14% increase (~2.9M -> 3.3M pkts/s) 40 sockets: 13% increase (~3.0M -> 3.4M pkts/s) This work is based off a similar implementation written by Ying Cai for implementing policy-based reuseport selection. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/addrconf.h | 3 ++- include/net/udp.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78003dfb8539..47f52d3cd8df 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -87,7 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2); +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/udp.h b/include/net/udp.h index 6d4ed18e1427..3b5d7f93bc23 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -191,7 +191,7 @@ static inline void udp_lib_close(struct sock *sk, long timeout) } int udp_lib_get_port(struct sock *sk, unsigned short snum, - int (*)(const struct sock *, const struct sock *), + int (*)(const struct sock *, const struct sock *, bool), unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); -- cgit v1.2.3 From 538950a1b7527a0a52ccd9337e3fcd304f027f13 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 4 Jan 2016 17:41:47 -0500 Subject: soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF Expose socket options for setting a classic or extended BPF program for use when selecting sockets in an SO_REUSEPORT group. These options can be used on the first socket to belong to a group before bind or on any socket in the group after bind. This change includes refactoring of the existing sk_filter code to allow reuse of the existing BPF filter validation checks. Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 ++ include/net/sock_reuseport.h | 10 +++++++++- include/net/udp.h | 5 +++-- include/uapi/asm-generic/socket.h | 3 +++ 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4165e9ac9e36..294c3cdf07b3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -447,6 +447,8 @@ void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 67d1eb8fd7af..7dda3d7adba8 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -1,6 +1,8 @@ #ifndef _SOCK_REUSEPORT_H #define _SOCK_REUSEPORT_H +#include +#include #include #include @@ -9,12 +11,18 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; extern int reuseport_alloc(struct sock *sk); extern int reuseport_add_sock(struct sock *sk, const struct sock *sk2); extern void reuseport_detach_sock(struct sock *sk); -extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash); +extern struct sock *reuseport_select_sock(struct sock *sk, + u32 hash, + struct sk_buff *skb, + int hdr_len); +extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, + struct bpf_prog *prog); #endif /* _SOCK_REUSEPORT_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 3b5d7f93bc23..2842541e28e7 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -258,7 +258,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, - struct udp_table *tbl); + struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, @@ -266,7 +266,8 @@ struct sock *udp6_lib_lookup(struct net *net, struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *tbl); + int dif, struct udp_table *tbl, + struct sk_buff *skb); /* * SNMP statistics for UDP and UDP-Lite diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 5c15c2a5c123..fb8a41668382 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -87,4 +87,7 @@ #define SO_ATTACH_BPF 50 #define SO_DETACH_BPF SO_DETACH_FILTER +#define SO_ATTACH_REUSEPORT_CBPF 51 +#define SO_ATTACH_REUSEPORT_EBPF 52 + #endif /* __ASM_GENERIC_SOCKET_H */ -- cgit v1.2.3 From 0d3b7f64c84d53658daf28e2f9772e38acb9340d Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 5 Jan 2016 13:19:31 +0200 Subject: Bluetooth: Change eir_has_data_type() to more generic eir_get_data() To make the EIR parsing helper more general purpose, make it return the found data and its length rather than just saying whether the data was present or not. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c95e0326c41a..372e2a7c4ada 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1283,31 +1283,41 @@ static inline void hci_role_switch_cfm(struct hci_conn *conn, __u8 status, mutex_unlock(&hci_cb_list_lock); } -static inline bool eir_has_data_type(u8 *data, size_t data_len, u8 type) +static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, + size_t *data_len) { size_t parsed = 0; - if (data_len < 2) - return false; + if (eir_len < 2) + return NULL; - while (parsed < data_len - 1) { - u8 field_len = data[0]; + while (parsed < eir_len - 1) { + u8 field_len = eir[0]; if (field_len == 0) break; parsed += field_len + 1; - if (parsed > data_len) + if (parsed > eir_len) break; - if (data[1] == type) - return true; + if (eir[1] != type) { + eir += field_len + 1; + continue; + } + + /* Zero length data */ + if (field_len == 1) + return NULL; - data += field_len + 1; + if (data_len) + *data_len = field_len - 1; + + return &eir[2]; } - return false; + return NULL; } static inline bool hci_bdaddr_is_rpa(bdaddr_t *bdaddr, u8 addr_type) -- cgit v1.2.3 From 78b781ca0d35191ebf8d8cad8beec810270f0f2e Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Tue, 5 Jan 2016 13:19:32 +0200 Subject: Bluetooth: Add support for Start Limited Discovery command This patch implements the mgmt Start Limited Discovery command. Most of existing Start Discovery code is reused since the only difference is the presence of a 'limited' flag as part of the discovery state. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 372e2a7c4ada..d4f82edb5cff 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -77,6 +77,7 @@ struct discovery_state { u8 last_adv_data_len; bool report_invalid_rssi; bool result_filtering; + bool limited; s8 rssi; u16 uuid_count; u8 (*uuids)[16]; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index af17774c9416..ea73e0826aa7 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -584,6 +584,8 @@ struct mgmt_rp_get_adv_size_info { __u8 max_scan_rsp_len; } __packed; +#define MGMT_OP_START_LIMITED_DISCOVERY 0x0041 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; -- cgit v1.2.3 From d6c0256a60e685214cc8cc2b886809f11efc0084 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 30 Dec 2015 23:50:46 +0800 Subject: sctp: add the rhashtable apis for sctp global transport hashtable tranport hashtbale will replace the association hashtable to do the lookup for transport, and then get association by t->assoc, rhashtable apis will be used because of it's resizable, scalable and using rcu. lport + rport + paddr will be the base hashkey to locate the chain, with net to protect one netns from another, then plus the laddr to compare to get the target. this patch will provider the lookup functions: - sctp_epaddr_lookup_transport - sctp_addrs_lookup_transport hash/unhash functions: - sctp_hash_transport - sctp_unhash_transport init/destroy functions: - sctp_transport_hashtable_init - sctp_transport_hashtable_destroy Signed-off-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 11 +++++++++++ include/net/sctp/structs.h | 5 +++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ce13cf20f625..7bbdfbab2efa 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -143,6 +143,17 @@ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_transport *t); void sctp_backlog_migrate(struct sctp_association *assoc, struct sock *oldsk, struct sock *newsk); +int sctp_transport_hashtable_init(void); +void sctp_transport_hashtable_destroy(void); +void sctp_hash_transport(struct sctp_transport *t); +void sctp_unhash_transport(struct sctp_transport *t); +struct sctp_transport *sctp_addrs_lookup_transport( + struct net *net, + const union sctp_addr *laddr, + const union sctp_addr *paddr); +struct sctp_transport *sctp_epaddr_lookup_transport( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr); /* * sctp/proc.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index eea9bdeecba2..4ab87d08e766 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -48,6 +48,7 @@ #define __sctp_structs_h__ #include +#include #include /* linux/in.h needs this!! */ #include /* We get struct sockaddr_in. */ #include /* We get struct in6_addr */ @@ -123,6 +124,8 @@ extern struct sctp_globals { struct sctp_hashbucket *assoc_hashtable; /* This is the sctp port control hash. */ struct sctp_bind_hashbucket *port_hashtable; + /* This is the hash of all transports. */ + struct rhashtable transport_hashtable; /* Sizes of above hashtables. */ int ep_hashsize; @@ -147,6 +150,7 @@ extern struct sctp_globals { #define sctp_assoc_hashtable (sctp_globals.assoc_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) +#define sctp_transport_hashtable (sctp_globals.transport_hashtable) #define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ @@ -753,6 +757,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet) struct sctp_transport { /* A list of transports. */ struct list_head transports; + struct rhash_head node; /* Reference counting. */ atomic_t refcnt; -- cgit v1.2.3 From b5eff7128366c4a7a9b502097a968ec9cae2bea2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 30 Dec 2015 23:50:49 +0800 Subject: sctp: drop the old assoc hashtable of sctp transport hashtable will replace the association hashtable, so association hashtable is not used in sctp any more, so drop the codes about that. Signed-off-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 21 --------------------- include/net/sctp/structs.h | 5 ----- 2 files changed, 26 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 7bbdfbab2efa..835aa2ed9870 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -126,8 +126,6 @@ int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg); */ int sctp_rcv(struct sk_buff *skb); void sctp_v4_err(struct sk_buff *skb, u32 info); -void sctp_hash_established(struct sctp_association *); -void sctp_unhash_established(struct sctp_association *); void sctp_hash_endpoint(struct sctp_endpoint *); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, @@ -530,25 +528,6 @@ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) return (net_hash_mix(net) + lport) & (sctp_ep_hashsize - 1); } -/* This is the hash function for the association hash table. */ -static inline int sctp_assoc_hashfn(struct net *net, __u16 lport, __u16 rport) -{ - int h = (lport << 16) + rport + net_hash_mix(net); - h ^= h>>8; - return h & (sctp_assoc_hashsize - 1); -} - -/* This is the hash function for the association hash table. This is - * not used yet, but could be used as a better hash function when - * we have a vtag. - */ -static inline int sctp_vtag_hashfn(__u16 lport, __u16 rport, __u32 vtag) -{ - int h = (lport << 16) + rport; - h ^= vtag; - return h & (sctp_assoc_hashsize - 1); -} - #define sctp_for_each_hentry(epb, head) \ hlist_for_each_entry(epb, head, node) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4ab87d08e766..20e72129be1c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -120,8 +120,6 @@ extern struct sctp_globals { /* This is the hash of all endpoints. */ struct sctp_hashbucket *ep_hashtable; - /* This is the hash of all associations. */ - struct sctp_hashbucket *assoc_hashtable; /* This is the sctp port control hash. */ struct sctp_bind_hashbucket *port_hashtable; /* This is the hash of all transports. */ @@ -129,7 +127,6 @@ extern struct sctp_globals { /* Sizes of above hashtables. */ int ep_hashsize; - int assoc_hashsize; int port_hashsize; /* Default initialization values to be applied to new associations. */ @@ -146,8 +143,6 @@ extern struct sctp_globals { #define sctp_address_families (sctp_globals.address_families) #define sctp_ep_hashsize (sctp_globals.ep_hashsize) #define sctp_ep_hashtable (sctp_globals.ep_hashtable) -#define sctp_assoc_hashsize (sctp_globals.assoc_hashsize) -#define sctp_assoc_hashtable (sctp_globals.assoc_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) #define sctp_transport_hashtable (sctp_globals.transport_hashtable) -- cgit v1.2.3 From b0844444590e18704644f707ea88bff1b976b0e7 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 29 Dec 2015 14:58:30 +0200 Subject: net/mlx5_core: Introduce access function to read internal timer A preparation step which adds support for reading the hardware internal timer and the hardware timestamping from the CQE. In addition, advertize device_frequency_khz HCA capability. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 20 +++++++++++++++++--- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7d3a85faefb7..df2f79ef3cac 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -443,9 +443,12 @@ struct mlx5_init_seg { __be32 rsvd1[120]; __be32 initializing; struct health_buffer health; - __be32 rsvd2[884]; + __be32 rsvd2[880]; + __be32 internal_timer_h; + __be32 internal_timer_l; + __be32 rsrv3[2]; __be32 health_counter; - __be32 rsvd3[1019]; + __be32 rsvd4[1019]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; @@ -601,7 +604,8 @@ struct mlx5_cqe64 { __be32 imm_inval_pkey; u8 rsvd40[4]; __be32 byte_cnt; - __be64 timestamp; + __be32 timestamp_h; + __be32 timestamp_l; __be32 sop_drop_qpn; __be16 wqe_counter; u8 signature; @@ -623,6 +627,16 @@ static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) return !!(cqe->l4_hdr_type_etc & 0x1); } +static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) +{ + u32 hi, lo; + + hi = be32_to_cpu(cqe->timestamp_h); + lo = be32_to_cpu(cqe->timestamp_l); + + return (u64)lo | ((u64)hi << 32); +} + enum { CQE_L4_HDR_TYPE_NONE = 0x0, CQE_L4_HDR_TYPE_TCP_NO_ACK = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 131a2737cfa3..1780a85a8797 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -829,9 +829,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_67[0xe0]; - - u8 reserved_68[0x1f]; + u8 reserved_67[0x40]; + u8 device_frequency_khz[0x20]; + u8 reserved_68[0x5f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; -- cgit v1.2.3 From 2fbf575867e5a181a3f3e5e29a2f0c205cca5fb3 Mon Sep 17 00:00:00 2001 From: "xypron.glpk@gmx.de" Date: Tue, 5 Jan 2016 10:12:49 +0100 Subject: include/uapi/linux/sockios.h: mark SIOCRTMSG unused IOCTL SIOCRTMSG does nothing but return EINVAL. So comment it as unused. SIOCRTMSG is only used in: * net/ipv4/af_inet.c * include/uapi/linux/sockios.h inet_ioctl calls ip_rt_ioctl. ip_rt_ioctl only handles SIOCADDRT and SIOCDELRT and returns -EINVAL otherwise. Signed-off-by: Heinrich Schuchardt Signed-off-by: David S. Miller --- include/uapi/linux/sockios.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index e888b1aed69f..8e7890b26d9a 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -27,7 +27,7 @@ /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ -#define SIOCRTMSG 0x890D /* call to routing system */ +#define SIOCRTMSG 0x890D /* unused */ /* Socket configuration controls. */ #define SIOCGIFNAME 0x8910 /* get iface name */ -- cgit v1.2.3 From a72a5e2d34ec2921c0d9a7545093087e4cb90d0a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Jan 2016 22:17:55 +0100 Subject: inet: kill unused skb_free op The only user was removed in commit 029f7f3b8701cc7a ("netfilter: ipv6: nf_defrag: avoid/free clone operations"). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ac42bbb37b2d..12aac0fd6ee7 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -99,7 +99,6 @@ struct inet_frags { void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); - void (*skb_free)(struct sk_buff *); void (*frag_expire)(unsigned long data); struct kmem_cache *frags_cachep; const char *frags_cache_name; -- cgit v1.2.3 From 81435c33e062cbd4508da6f64655cb0967eeb65f Mon Sep 17 00:00:00 2001 From: Elad Raz Date: Wed, 6 Jan 2016 13:01:05 +0100 Subject: switchdev: add bridge vlan_filtering attribute Adding vlan_filtering attribute to allow hardware vendor to support vlan-aware bridges. Vlan_filtering is a per-bridge attribute. Signed-off-by: Elad Raz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 6612946167fe..603ae2f88dbb 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -47,6 +47,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_PORT_STP_STATE, SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, + SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, }; struct switchdev_attr { @@ -58,6 +59,7 @@ struct switchdev_attr { u8 stp_state; /* PORT_STP_STATE */ unsigned long brport_flags; /* PORT_BRIDGE_FLAGS */ u32 ageing_time; /* BRIDGE_AGEING_TIME */ + bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ } u; }; -- cgit v1.2.3 From cdba756f5803a2f0a8bbc6605acc166dd817979e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Jan 2016 06:53:50 -0800 Subject: net: move ndo_features_check() close to ndo_start_xmit() TX fast path uses ndo_start_xmit(), ndo_features_check() and ndo_select_queue(). Move ndo_features_check() close to ndo_start_xmit() to increase data locality. All "struct net_device_ops" should now be using C99 initializers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c20b814e46a0..8d8e5ca951b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -812,6 +812,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * netdev_features_t (*ndo_fix_features)(struct net_device *dev, + * netdev_features_t features); + * Adjusts the requested feature flags according to device-specific + * constraints, and returns the resulting flags. Must not modify + * the device state. + * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -959,12 +965,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Called to release previously enslaved netdev. * * Feature/offload setting functions. - * netdev_features_t (*ndo_fix_features)(struct net_device *dev, - * netdev_features_t features); - * Adjusts the requested feature flags according to device-specific - * constraints, and returns the resulting flags. Must not modify - * the device state. - * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). @@ -1081,8 +1081,11 @@ struct net_device_ops { void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); - netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, - struct net_device *dev); + netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, + struct net_device *dev); + netdev_features_t (*ndo_features_check)(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -1245,9 +1248,6 @@ struct net_device_ops { struct net_device *dev, void *priv); int (*ndo_get_lock_subclass)(struct net_device *dev); - netdev_features_t (*ndo_features_check) (struct sk_buff *skb, - struct net_device *dev, - netdev_features_t features); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); -- cgit v1.2.3 From c7f5d105495a38ed09e70d825f75d9d7d5407264 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Nov 2015 11:34:57 -0500 Subject: net: Add eth_platform_get_mac_address() helper. A repeating pattern in drivers has become to use OF node information and, if not found, platform specific host information to extract the ethernet address for a given device. Currently this is done with a call to of_get_mac_address() and then some ifdef'd stuff for SPARC. Consolidate this into a portable routine, and provide the arch_get_platform_mac_address() weak function hook for all architectures to implement if they want. Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index eb049c622208..37ff4a6faa9a 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,9 @@ #include #ifdef __KERNEL__ +struct device; +int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); +unsigned char *arch_get_platform_get_mac_address(void); u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; -- cgit v1.2.3 From ccaa953e9fc7ebb90fba4e4815966683bef4866f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:06 +0100 Subject: phy: Consistently use addr for address on an MII bus Within phy.h, an address on an MII bus has been called both addr and phy_id. phy_id is particularly confusion, since it also means the ID found in register 3, if the device on the bus is a phy. Consistently use addr. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index a89cb0eef911..77b5e56e2a92 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -158,8 +158,8 @@ struct mii_bus { const char *name; char id[MII_BUS_ID_SIZE]; void *priv; - int (*read)(struct mii_bus *bus, int phy_id, int regnum); - int (*write)(struct mii_bus *bus, int phy_id, int regnum, u16 val); + int (*read)(struct mii_bus *bus, int addr, int regnum); + int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); int (*reset)(struct mii_bus *bus); /* -- cgit v1.2.3 From bac83c653799d7ea3f6cc4d7396d75adc5e0f778 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:07 +0100 Subject: mdio: Move mdiobus_read/write operatings into mdio.h These are logically MDIO operations, not phy operations, so move them into the mdio header. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 6 ++++++ include/linux/phy.h | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index b42963bc81dd..0d073c23c10d 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -11,6 +11,7 @@ #include +struct mii_bus; static inline bool mdio_phy_id_is_c45(int phy_id) { @@ -173,4 +174,9 @@ static inline u16 ethtool_adv_to_mmd_eee_adv_t(u32 adv) return reg; } +int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); +int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); +int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 77b5e56e2a92..8ca161a37e8a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -212,11 +213,6 @@ static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) void devm_mdiobus_free(struct device *dev, struct mii_bus *bus); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); -int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); -int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); -int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); -int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); - #define PHY_INTERRUPT_DISABLED 0x0 #define PHY_INTERRUPT_ENABLED 0x80000000 -- cgit v1.2.3 From 72ba48be3ec8e70937ad97d4420ef7144617c64b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:09 +0100 Subject: phy: Add phydev_err() and phydev_dbg() macros In preparation for moving some of the phy_device structure members, add macros for printing errors and debug information. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 8ca161a37e8a..dbcf9fdd960c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -777,6 +777,12 @@ static inline int phy_read_status(struct phy_device *phydev) return phydev->drv->read_status(phydev); } +#define phydev_err(_phydev, format, args...) \ + dev_err(&_phydev->dev, format, ##args) + +#define phydev_dbg(_phydev, format, args...) \ + dev_dbg(&_phydev->dev, format, ##args) + int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From 84eff6d194df442bee62c129f2f47efb0dbd0468 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:10 +0100 Subject: phy: add phydev_name() wrapper Add a phydev_name() function, to help with moving some structure members from phy_device. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index dbcf9fdd960c..5f5cc3424b9e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -783,6 +783,11 @@ static inline int phy_read_status(struct phy_device *phydev) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->dev, format, ##args) +static inline const char *phydev_name(const struct phy_device *phydev) +{ + return dev_name(&phydev->dev); +} + int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From 053e7e169229adebbc27fc176c5369398e9f5eba Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:12 +0100 Subject: phy: phy_{read|write}_mmd_indirect: get addr from phydev The address of the device can be determined from the phydev structure, rather than passing it as a parameter. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5f5cc3424b9e..08198ce98773 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -629,14 +629,12 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) * phy_read_mmd_indirect - reads data from the MMD registers * @phydev: The PHY device bus * @prtad: MMD Address - * @devad: MMD DEVAD * @addr: PHY address on the MII bus * * Description: it reads data from the MMD registers (clause 22 to access to * clause 45) of the specified phy address. */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, int addr); +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); /** * phy_read - Convenience function for reading a given PHY register @@ -735,14 +733,13 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, * @phydev: The PHY device * @prtad: MMD Address * @devad: MMD DEVAD - * @addr: PHY address on the MII bus * @data: data to write in the MMD register * * Description: Write data from the MMD registers of the specified * phy address. */ void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, int addr, u32 data); + int devad, u32 data); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, -- cgit v1.2.3 From 2220943a21e26d97d7fd8f83c004b947326b469d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:13 +0100 Subject: phy: Centralise print about attached phy Many Ethernet drivers contain the same netdev_info() print statement about the attached phy. Move it into the phy device code. Additionally add a varargs function which can be used to append additional information. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 08198ce98773..ecbf6382ba29 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -16,6 +16,7 @@ #ifndef __PHY_H #define __PHY_H +#include #include #include #include @@ -785,6 +786,9 @@ static inline const char *phydev_name(const struct phy_device *phydev) return dev_name(&phydev->dev); } +void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) + __printf(2, 3); +void phy_attached_info(struct phy_device *phydev); int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From e7f4dc3536a40097f95103ddf98dd55b3a980f5b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:15 +0100 Subject: mdio: Move allocation of interrupts into core Have mdio_alloc() create the array of interrupt numbers, and initialize it to POLLING. This is what most MDIO drivers want, so allowing code to be removed from the drivers. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ecbf6382ba29..a5473c9e19de 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -189,10 +189,10 @@ struct mii_bus { u32 phy_ignore_ta_mask; /* - * Pointer to an array of interrupts, each PHY's - * interrupt at the index matching its address + * An array of interrupts, each PHY's interrupt at the index + * matching its address */ - int *irq; + int irq[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From e5a03bfd873c29eb786655ef2e95e53ed242b404 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:16 +0100 Subject: phy: Add an mdio_device structure Not all devices attached to an MDIO bus are phys. So add an mdio_device structure to represent the generic parts of an mdio device, and place this structure into the phy_device. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 9 +++++++++ include/linux/phy.h | 26 +++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 0d073c23c10d..94f9f1491cde 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -13,6 +13,15 @@ struct mii_bus; +struct mdio_device { + struct device dev; + + struct mii_bus *bus; + /* Bus address of the MDIO device (0-31) */ + int addr; +}; +#define to_mdio_device(d) container_of(d, struct mdio_device, dev) + static inline bool mdio_phy_id_is_c45(int phy_id) { return (phy_id & MDIO_PHY_ID_C45) && !(phy_id & ~MDIO_PHY_ID_C45_MASK); diff --git a/include/linux/phy.h b/include/linux/phy.h index a5473c9e19de..239a0c2bc49d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -358,14 +358,12 @@ struct phy_c45_device_ids { * handling, as well as handling shifts in PHY hardware state */ struct phy_device { + struct mdio_device mdio; + /* Information about the PHY type */ /* And management functions */ struct phy_driver *drv; - struct mii_bus *bus; - - struct device dev; - u32 phy_id; struct phy_c45_device_ids c45_ids; @@ -381,9 +379,6 @@ struct phy_device { phy_interface_t interface; - /* Bus address of the PHY (0-31) */ - int addr; - /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) @@ -432,7 +427,8 @@ struct phy_device { void (*adjust_link)(struct net_device *dev); }; -#define to_phy_device(d) container_of(d, struct phy_device, dev) +#define to_phy_device(d) container_of(to_mdio_device(d), \ + struct phy_device, mdio) /* struct phy_driver: Driver structure for a particular PHY type * @@ -622,7 +618,7 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) if (!phydev->is_c45) return -EOPNOTSUPP; - return mdiobus_read(phydev->bus, phydev->addr, + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); } @@ -648,7 +644,7 @@ int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { - return mdiobus_read(phydev->bus, phydev->addr, regnum); + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** @@ -663,7 +659,7 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum) */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { - return mdiobus_write(phydev->bus, phydev->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** @@ -726,7 +722,7 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); - return mdiobus_write(phydev->bus, phydev->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** @@ -776,14 +772,14 @@ static inline int phy_read_status(struct phy_device *phydev) } #define phydev_err(_phydev, format, args...) \ - dev_err(&_phydev->dev, format, ##args) + dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ - dev_dbg(&_phydev->dev, format, ##args) + dev_dbg(&_phydev->mdio.dev, format, ##args); static inline const char *phydev_name(const struct phy_device *phydev) { - return dev_name(&phydev->dev); + return dev_name(&phydev->mdio.dev); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) -- cgit v1.2.3 From 7f854420fbfe9d49afe2ffb1df052cfe8e215541 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:18 +0100 Subject: phy: Add API for {un}registering an mdio device to a bus. Rather than have drivers directly manipulate the mii_bus structure, provide and API for registering and unregistering devices on an MDIO bus, and performing lookups. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 8 ++++++++ include/linux/phy.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 94f9f1491cde..8cd9579e18ea 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -19,9 +19,12 @@ struct mdio_device { struct mii_bus *bus; /* Bus address of the MDIO device (0-31) */ int addr; + int flags; }; #define to_mdio_device(d) container_of(d, struct mdio_device, dev) +#define MDIO_DEVICE_FLAG_PHY 1 + static inline bool mdio_phy_id_is_c45(int phy_id) { return (phy_id & MDIO_PHY_ID_C45) && !(phy_id & ~MDIO_PHY_ID_C45_MASK); @@ -188,4 +191,9 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_register_device(struct mdio_device *mdiodev); +int mdiobus_unregister_device(struct mdio_device *mdiodev); +bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); +struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 239a0c2bc49d..2d7beef20825 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -180,7 +180,7 @@ struct mii_bus { struct device dev; /* list of all PHYs on bus */ - struct phy_device *phy_map[PHY_MAX_ADDR]; + struct mdio_device *mdio_map[PHY_MAX_ADDR]; /* PHY addresses to be ignored when probing */ u32 phy_mask; -- cgit v1.2.3 From bc87922ff59d364a33e9bce0febdef21a7fbd2af Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:21 +0100 Subject: phy: Move PHY PM operations into phy_device The MDIO PM operations are really PHY device PM operations. So move them into phy_device. This will be needed when we support devices on the mdio bus which are not PHYs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 8cd9579e18ea..9f844d372ed5 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -15,7 +15,7 @@ struct mii_bus; struct mdio_device { struct device dev; - + const struct dev_pm_ops *pm_ops; struct mii_bus *bus; /* Bus address of the MDIO device (0-31) */ int addr; -- cgit v1.2.3 From be01da72b1b832b89fbdf59ae6f1b60e53ca2987 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:22 +0100 Subject: phy: Centralize setting driver module owner Rather than have each driver set the driver owner field, do it once in the core code. This will also help with later changes, when the device structure will move. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d7beef20825..49e4418822b3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -797,8 +797,9 @@ int genphy_resume(struct phy_device *phydev); int genphy_soft_reset(struct phy_device *phydev); void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); -int phy_driver_register(struct phy_driver *new_driver); -int phy_drivers_register(struct phy_driver *new_driver, int n); +int phy_driver_register(struct phy_driver *new_driver, struct module *owner); +int phy_drivers_register(struct phy_driver *new_driver, int n, + struct module *owner); void phy_state_machine(struct work_struct *work); void phy_change(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev, int new_link); @@ -843,7 +844,7 @@ extern struct bus_type mdio_bus_type; #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ - return phy_drivers_register(__phy_drivers, __count); \ + return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ -- cgit v1.2.3 From e76a4957c5ee5cf69cea89d450c29c536e77ce9e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:23 +0100 Subject: phy: Move phy specific bus match into phy_device Matching a driver to a device has both generic parts, and parts which are specific to PHY devices. Move the PHY specific parts into phy_device. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 9f844d372ed5..0690359e55a5 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -17,6 +17,7 @@ struct mdio_device { struct device dev; const struct dev_pm_ops *pm_ops; struct mii_bus *bus; + int (*bus_match)(struct device *dev, struct device_driver *drv); /* Bus address of the MDIO device (0-31) */ int addr; int flags; -- cgit v1.2.3 From a9049e0c513c4521dbfaa302af8ed08b3366b41f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:26 +0100 Subject: mdio: Add support for mdio drivers. Not all devices on an MDIO bus are PHYs. Meaning not all MDIO drivers are PHY drivers. Add support for generic MDIO drivers. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 9 +++++---- 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 0690359e55a5..75f7fad0af4f 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -15,6 +15,7 @@ struct mii_bus; struct mdio_device { struct device dev; + const struct dev_pm_ops *pm_ops; struct mii_bus *bus; int (*bus_match)(struct device *dev, struct device_driver *drv); @@ -24,7 +25,37 @@ struct mdio_device { }; #define to_mdio_device(d) container_of(d, struct mdio_device, dev) +/* struct mdio_driver_common: Common to all MDIO drivers */ +struct mdio_driver_common { + struct device_driver driver; + int flags; +}; #define MDIO_DEVICE_FLAG_PHY 1 +#define to_mdio_common_driver(d) \ + container_of(d, struct mdio_driver_common, driver) + +/* struct mdio_driver: Generic MDIO driver */ +struct mdio_driver { + struct mdio_driver_common mdiodrv; + + /* + * Called during discovery. Used to set + * up device-specific structures, if any + */ + int (*probe)(struct mdio_device *mdiodev); + + /* Clears up any memory if needed */ + void (*remove)(struct mdio_device *mdiodev); +}; +#define to_mdio_driver(d) \ + container_of(to_mdio_common_driver(d), struct mdio_driver, mdiodrv) + +void mdio_device_free(struct mdio_device *mdiodev); +struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr); +int mdio_device_register(struct mdio_device *mdiodev); +void mdio_device_remove(struct mdio_device *mdiodev); +int mdio_driver_register(struct mdio_driver *drv); +void mdio_driver_unregister(struct mdio_driver *drv); static inline bool mdio_phy_id_is_c45(int phy_id) { @@ -197,4 +228,23 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); +/** + * module_mdio_driver() - Helper macro for registering mdio drivers + * + * Helper macro for MDIO drivers which do not do anything special in module + * init/exit. Each module may only use this macro once, and calling it + * replaces module_init() and module_exit(). + */ +#define mdio_module_driver(_mdio_driver) \ +static int __init mdio_module_init(void) \ +{ \ + return mdio_driver_register(&_mdio_driver); \ +} \ +module_init(mdio_module_init); \ +static void __exit mdio_module_exit(void) \ +{ \ + mdio_driver_unregister(&_mdio_driver); \ +} \ +module_exit(mdio_module_exit) + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 49e4418822b3..d6f3641e7933 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -60,6 +60,7 @@ #define PHY_HAS_INTERRUPT 0x00000001 #define PHY_HAS_MAGICANEG 0x00000002 #define PHY_IS_INTERNAL 0x00000004 +#define MDIO_DEVICE_IS_PHY 0x80000000 /* Interface Mode definitions */ typedef enum { @@ -432,6 +433,7 @@ struct phy_device { /* struct phy_driver: Driver structure for a particular PHY type * + * driver_data: static driver data * phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field @@ -441,7 +443,6 @@ struct phy_device { * by this PHY * flags: A bitfield defining certain other features this PHY * supports (like interrupts) - * driver_data: static driver data * * The drivers must implement config_aneg and read_status. All * other functions are optional. Note that none of these @@ -452,6 +453,7 @@ struct phy_device { * supported in the driver). */ struct phy_driver { + struct mdio_driver_common mdiodrv; u32 phy_id; char *name; unsigned int phy_id_mask; @@ -587,10 +589,9 @@ struct phy_driver { void (*get_strings)(struct phy_device *dev, u8 *data); void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); - - struct device_driver driver; }; -#define to_phy_driver(d) container_of(d, struct phy_driver, driver) +#define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ + struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" #define PHY_ANY_UID 0xffffffff -- cgit v1.2.3 From 711fdba37a3dd7ee487e28767f9f0e67144cbf80 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:27 +0100 Subject: mdio: Abstract device_remove() and device_free() Make device_free and device_remove operations in the mdio device structure, so the core code does not need to differentiate between phy devices and generic mdio devices. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 75f7fad0af4f..5bfd99d1a40a 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -18,7 +18,11 @@ struct mdio_device { const struct dev_pm_ops *pm_ops; struct mii_bus *bus; + int (*bus_match)(struct device *dev, struct device_driver *drv); + void (*device_free)(struct mdio_device *mdiodev); + void (*device_remove)(struct mdio_device *mdiodev); + /* Bus address of the MDIO device (0-31) */ int addr; int flags; -- cgit v1.2.3 From e6d8ecac9e68265aee9be711c5bd29406129666f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Falgueras=20Garc=C3=ADa?= Date: Tue, 5 Jan 2016 14:03:32 +0100 Subject: netfilter: nf_tables: Add new attributes into nft_set to store user data. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User data is stored at after 'nft_set_ops' private data into 'data[]' flexible array. The field 'udata' points to user data and 'udlen' stores its length. Add new flag NFTA_SET_USERDATA. Signed-off-by: Carlos Falgueras García Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0191fbb33a2f..f6b1daf2e698 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -291,6 +291,8 @@ void nft_unregister_set(struct nft_set_ops *ops); * @timeout: default timeout value in msecs * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) + * @udlen: user data length + * @udata: user data * @ops: set ops * @pnet: network namespace * @flags: set flags @@ -310,6 +312,8 @@ struct nft_set { u64 timeout; u32 gc_int; u16 policy; + u16 udlen; + unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; possible_net_t pnet; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 731288a039f6..03c28a402c63 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -291,6 +291,7 @@ enum nft_set_desc_attributes { * @NFTA_SET_ID: uniquely identifies a set in a transaction (NLA_U32) * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64) * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32) + * @NFTA_SET_USERDATA: user data (NLA_BINARY) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -306,6 +307,7 @@ enum nft_set_attributes { NFTA_SET_ID, NFTA_SET_TIMEOUT, NFTA_SET_GC_INTERVAL, + NFTA_SET_USERDATA, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) -- cgit v1.2.3 From 48f66c905a976bf0ff092fc24f08d9addd82a245 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Jan 2016 21:34:24 +0100 Subject: netfilter: nft_ct: add byte/packet counter support If the accounting extension isn't present, we'll return a counter value of 0. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 03c28a402c63..be41ffc128b8 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -757,6 +757,8 @@ enum nft_ct_keys { NFT_CT_PROTO_SRC, NFT_CT_PROTO_DST, NFT_CT_LABELS, + NFT_CT_PKTS, + NFT_CT_BYTES, }; /** -- cgit v1.2.3 From 0797cbd8e2741c69a4d416d8f669639a064db8d1 Mon Sep 17 00:00:00 2001 From: Lance Richardson Date: Wed, 6 Jan 2016 17:22:46 -0500 Subject: ipv4: eliminate endianness warnings in ip_fib.h fib_multipath_hash() computes a hash using __be32 values, force cast these to u32 to pacify sparse. Signed-off-by: Lance Richardson Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9f4df68105ab..7029527725dd 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -325,7 +325,8 @@ extern u32 fib_multipath_secret __read_mostly; static inline int fib_multipath_hash(__be32 saddr, __be32 daddr) { - return jhash_2words(saddr, daddr, fib_multipath_secret) >> 1; + return jhash_2words((__force u32)saddr, (__force u32)daddr, + fib_multipath_secret) >> 1; } void fib_select_multipath(struct fib_result *res, int hash); -- cgit v1.2.3 From 01dd194c387af5b3c4c1f6459d30f596565e466c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 Jan 2016 22:32:16 +0100 Subject: bpf: cleanup bpf_prog_run_{save,clear}_cb helpers Move the details behind the cb[] access into a small helper to decouple and make them generic for bpf_prog_run_save_cb()/bpf_prog_run_clear_cb() that was introduced via commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs"). Also add a comment to better clarify what is done in bpf_skb_cb(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5b5891ed1ba..43aa1f8855c7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -350,25 +350,43 @@ struct sk_filter { #define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN + +static inline u8 *bpf_skb_cb(struct sk_buff *skb) +{ + /* eBPF programs may read/write skb->cb[] area to transfer meta + * data between tail calls. Since this also needs to work with + * tc, that scratch memory is mapped to qdisc_skb_cb's data area. + * + * In some socket filter cases, the cb unfortunately needs to be + * saved/restored so that protocol specific skb->cb[] data won't + * be lost. In any case, due to unpriviledged eBPF programs + * attached to sockets, we need to clear the bpf_skb_cb() area + * to not leak previous contents to user space. + */ + BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != BPF_SKB_CB_LEN); + BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != + FIELD_SIZEOF(struct qdisc_skb_cb, data)); + + return qdisc_skb_cb(skb)->data; +} + static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { - u8 *cb_data = qdisc_skb_cb(skb)->data; - u8 saved_cb[QDISC_CB_PRIV_LEN]; + u8 *cb_data = bpf_skb_cb(skb); + u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; - BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != - QDISC_CB_PRIV_LEN); - if (unlikely(prog->cb_access)) { - memcpy(saved_cb, cb_data, sizeof(saved_cb)); - memset(cb_data, 0, sizeof(saved_cb)); + memcpy(cb_saved, cb_data, sizeof(cb_saved)); + memset(cb_data, 0, sizeof(cb_saved)); } res = BPF_PROG_RUN(prog, skb); if (unlikely(prog->cb_access)) - memcpy(cb_data, saved_cb, sizeof(saved_cb)); + memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } @@ -376,10 +394,11 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { - u8 *cb_data = qdisc_skb_cb(skb)->data; + u8 *cb_data = bpf_skb_cb(skb); if (unlikely(prog->cb_access)) - memset(cb_data, 0, QDISC_CB_PRIV_LEN); + memset(cb_data, 0, BPF_SKB_CB_LEN); + return BPF_PROG_RUN(prog, skb); } -- cgit v1.2.3 From 4d41e12593a9a6c4aaf113d44c8c619067b2b0aa Mon Sep 17 00:00:00 2001 From: Elad Raz Date: Sun, 10 Jan 2016 21:06:22 +0100 Subject: switchdev: Adding MDB entry offload Define HW multicast entry: MAC and VID. Using a MAC address simplifies support for both IPV4 and IPv6. Signed-off-by: Elad Raz Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 603ae2f88dbb..d451122e8404 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -68,6 +68,7 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_IPV4_FIB, SWITCHDEV_OBJ_ID_PORT_FDB, + SWITCHDEV_OBJ_ID_PORT_MDB, }; struct switchdev_obj { @@ -113,6 +114,16 @@ struct switchdev_obj_port_fdb { #define SWITCHDEV_OBJ_PORT_FDB(obj) \ container_of(obj, struct switchdev_obj_port_fdb, obj) +/* SWITCHDEV_OBJ_ID_PORT_MDB */ +struct switchdev_obj_port_mdb { + struct switchdev_obj obj; + unsigned char addr[ETH_ALEN]; + u16 vid; +}; + +#define SWITCHDEV_OBJ_PORT_MDB(obj) \ + container_of(obj, struct switchdev_obj_port_mdb, obj) + void switchdev_trans_item_enqueue(struct switchdev_trans *trans, void *data, void (*destructor)(void const *), struct switchdev_trans_item *tritem); -- cgit v1.2.3 From 787d7ac308ff2279e4b2ea393ad4d990de486ef2 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 7 Jan 2016 14:28:39 +0100 Subject: udp: restrict offloads to one namespace udp tunnel offloads tend to aggregate datagrams based on inner headers. gro engine gets notified by tunnel implementations about possible offloads. The match is solely based on the port number. Imagine a tunnel bound to port 53, the offloading will look into all DNS packets and tries to aggregate them based on the inner data found within. This could lead to data corruption and malformed DNS packets. While this patch minimizes the problem and helps an administrator to find the issue by querying ip tunnel/fou, a better way would be to match on the specific destination ip address so if a user space socket is bound to the same address it will conflict. Cc: Tom Herbert Cc: Eric Dumazet Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/protocol.h b/include/net/protocol.h index d6fcc1fcdb5b..da689f5432de 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -107,7 +107,7 @@ int inet_del_offload(const struct net_offload *prot, unsigned char num); void inet_register_protosw(struct inet_protosw *p); void inet_unregister_protosw(struct inet_protosw *p); -int udp_add_offload(struct udp_offload *prot); +int udp_add_offload(struct net *net, struct udp_offload *prot); void udp_del_offload(struct udp_offload *prot); #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From 13b287e8d1cad951634389f85b8c9b816bd3bb1e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 7 Jan 2016 16:38:43 +0200 Subject: ipv4: Namespaceify tcp_keepalive_time sysctl knob Different net namespaces might have different requirements as to the keepalive time of tcp sockets. This might be required in cases where different firewall rules are in place which require tcp timeout sockets to be increased/decreased independently of the host. Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d75be32650ba..9e9bbebaebd1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -94,6 +94,8 @@ struct netns_ipv4 { int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; + int sysctl_tcp_keepalive_time; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/tcp.h b/include/net/tcp.h index f33fecf4e282..cb4d4cf25744 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_keepalive_intvl; extern int sysctl_tcp_syn_retries; @@ -1230,7 +1229,9 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) static inline int keepalive_time_when(const struct tcp_sock *tp) { - return tp->keepalive_time ? : sysctl_tcp_keepalive_time; + struct net *net = sock_net((struct sock *)tp); + + return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; } static inline int keepalive_probes(const struct tcp_sock *tp) -- cgit v1.2.3 From 9bd6861bd4326e3afd3f14a9ec8a723771fb20bb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 7 Jan 2016 16:38:44 +0200 Subject: ipv4: Namespecify tcp_keepalive_probes sysctl knob This is required to have full tcp keepalive mechanism namespace support. Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e9bbebaebd1..6e26ea2d0374 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -95,6 +95,7 @@ struct netns_ipv4 { u32 sysctl_tcp_probe_interval; int sysctl_tcp_keepalive_time; + int sysctl_tcp_keepalive_probes; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index cb4d4cf25744..0646521400bf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_keepalive_intvl; extern int sysctl_tcp_syn_retries; extern int sysctl_tcp_synack_retries; @@ -1236,7 +1235,9 @@ static inline int keepalive_time_when(const struct tcp_sock *tp) static inline int keepalive_probes(const struct tcp_sock *tp) { - return tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; + struct net *net = sock_net((struct sock *)tp); + + return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) -- cgit v1.2.3 From b840d15d39128d08ed4486085e5507d2617b9ae1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 7 Jan 2016 16:38:45 +0200 Subject: ipv4: Namespecify the tcp_keepalive_intvl sysctl knob This is the final part required to namespaceify the tcp keep alive mechanism. Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6e26ea2d0374..2b7907a35568 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -96,6 +96,7 @@ struct netns_ipv4 { int sysctl_tcp_keepalive_time; int sysctl_tcp_keepalive_probes; + int sysctl_tcp_keepalive_intvl; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0646521400bf..a80255f4ca33 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_intvl; extern int sysctl_tcp_syn_retries; extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; @@ -1223,7 +1222,9 @@ void tcp_enter_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { - return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl; + struct net *net = sock_net((struct sock *)tp); + + return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; } static inline int keepalive_time_when(const struct tcp_sock *tp) -- cgit v1.2.3 From fdc5432a7b44ab7de17141beec19d946b9344e91 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 15:50:22 +0100 Subject: net, sched: add skb_at_tc_ingress helper Add a skb_at_tc_ingress() as this will be needed elsewhere as well and can hide the ugly ifdef. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sch_generic.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b2a8e6338576..636a362a0e03 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -407,6 +407,15 @@ bool tcf_destroy(struct tcf_proto *tp, bool force); void tcf_destroy_chain(struct tcf_proto __rcu **fl); int skb_do_redirect(struct sk_buff *); +static inline bool skb_at_tc_ingress(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + return G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + return false; +#endif +} + /* Reset all TX qdiscs greater then index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { -- cgit v1.2.3 From f8ffad69c9f8b8dfb0b633425d4ef4d2493ba61a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 15:50:23 +0100 Subject: bpf: add skb_postpush_rcsum and fix dev_forward_skb occasions Add a small helper skb_postpush_rcsum() and fix up redirect locations that need CHECKSUM_COMPLETE fixups on ingress. dev_forward_skb() expects a proper csum that covers also Ethernet header, f.e. since 2c26d34bbcc0 ("net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding"), we also do skb_postpull_rcsum() after pulling Ethernet header off via eth_type_trans(). When using eBPF in a netns setup f.e. with vxlan in collect metadata mode, I can trigger the following csum issue with an IPv6 setup: [ 505.144065] dummy1: hw csum failure [...] [ 505.144108] Call Trace: [ 505.144112] [] dump_stack+0x44/0x5c [ 505.144134] [] netdev_rx_csum_fault+0x3a/0x40 [ 505.144142] [] __skb_checksum_complete+0xcf/0xe0 [ 505.144149] [] nf_ip6_checksum+0xb2/0x120 [ 505.144161] [] icmpv6_error+0x17e/0x328 [nf_conntrack_ipv6] [ 505.144170] [] ? ip6t_do_table+0x2fa/0x645 [ip6_tables] [ 505.144177] [] ? ipv6_get_l4proto+0x65/0xd0 [nf_conntrack_ipv6] [ 505.144189] [] nf_conntrack_in+0xc2/0x5a0 [nf_conntrack] [ 505.144196] [] ipv6_conntrack_in+0x1c/0x20 [nf_conntrack_ipv6] [ 505.144204] [] nf_iterate+0x5d/0x70 [ 505.144210] [] nf_hook_slow+0x66/0xc0 [ 505.144218] [] ipv6_rcv+0x3f2/0x4f0 [ 505.144225] [] ? ip6_make_skb+0x1b0/0x1b0 [ 505.144232] [] __netif_receive_skb_core+0x36b/0x9a0 [ 505.144239] [] ? __netif_receive_skb+0x18/0x60 [ 505.144245] [] __netif_receive_skb+0x18/0x60 [ 505.144252] [] process_backlog+0x9f/0x140 [ 505.144259] [] net_rx_action+0x145/0x320 [...] What happens is that on ingress, we push Ethernet header back in, either from cls_bpf or right before skb_do_redirect(), but without updating csum. The "hw csum failure" can be fixed by using the new skb_postpush_rcsum() helper for the dev_forward_skb() case to correct the csum diff again. Thanks to Hannes Frederic Sowa for the csum_partial() idea! Fixes: 3896d655f4d4 ("bpf: introduce bpf_clone_redirect() helper") Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b6bd42d6134..07f9ccd28654 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2805,6 +2805,23 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); +static inline void skb_postpush_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + /* For performing the reverse operation to skb_postpull_rcsum(), + * we can instead of ... + * + * skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); + * + * ... just use this equivalent version here to save a few + * instructions. Feeding csum of 0 in csum_partial() and later + * on adding skb->csum is equivalent to feed skb->csum in the + * first place. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_partial(start, len, skb->csum); +} + /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim -- cgit v1.2.3 From 1f211a1b929c804100e138c5d3d656992cfd5622 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 22:29:47 +0100 Subject: net, sched: add clsact qdisc This work adds a generalization of the ingress qdisc as a qdisc holding only classifiers. The clsact qdisc works on ingress, but also on egress. In both cases, it's execution happens without taking the qdisc lock, and the main difference for the egress part compared to prior version of [1] is that this can be applied with _any_ underlying real egress qdisc (also classless ones). Besides solving the use-case of [1], that is, allowing for more programmability on assigning skb->priority for the mqprio case that is supported by most popular 10G+ NICs, it also opens up a lot more flexibility for other tc applications. The main work on classification can already be done at clsact egress time if the use-case allows and state stored for later retrieval f.e. again in skb->priority with major/minors (which is checked by most classful qdiscs before consulting tc_classify()) and/or in other skb fields like skb->tc_index for some light-weight post-processing to get to the eventual classid in case of a classful qdisc. Another use case is that the clsact egress part allows to have a central egress counterpart to the ingress classifiers, so that classifiers can easily share state (e.g. in cls_bpf via eBPF maps) for ingress and egress. Currently, default setups like mq + pfifo_fast would require for this to use, for example, prio qdisc instead (to get a tc_classify() run) and to duplicate the egress classifier for each queue. With clsact, it allows for leaving the setup as is, it can additionally assign skb->priority to put the skb in one of pfifo_fast's bands and it can share state with maps. Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid) w/o the need to perform a skb_dst_force() to hold on to it any longer. In lwt case, we can also use this facility to setup dst metadata via cls_bpf (bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for that (case of IFF_NO_QUEUE devices, for example). The realization can be done without any changes to the scheduler core framework. All it takes is that we have two a-priori defined minors/child classes, where we can mux between ingress and egress classifier list (dev->ingress_cl_list and dev->egress_cl_list, latter stored close to dev->_tx to avoid extra cacheline miss for moderate loads). The egress part is a bit similar modelled to handle_ing() and patched to a noop in case the functionality is not used. Both handlers are now called sch_handle_ingress() and sch_handle_egress(), code sharing among the two doesn't seem practical as there are various minor differences in both paths, so that making them conditional in a single handler would rather slow things down. Full compatibility to ingress qdisc is provided as well. Since both piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist per netdevice, and thus ingress qdisc specific behaviour can be retained for user space. This means, either a user does 'tc qdisc add dev foo ingress' and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact' alternative, where both, ingress and egress classifier can be configured as in the below example. ingress qdisc supports attaching classifier to any minor number whereas clsact has two fixed minors for muxing between the lists, therefore to not break user space setups, they are better done as two separate qdiscs. I decided to extend the sch_ingress module with clsact functionality so that commonly used code can be reused, the module is being aliased with sch_clsact so that it can be auto-loaded properly. Alternative would have been to add a flag when initializing ingress to alter its behaviour plus aliasing to a different name (as it's more than just ingress). However, the first would end up, based on the flag, choosing the new/old behaviour by calling different function implementations to handle each anyway, the latter would require to register ingress qdisc once again under different alias. So, this really begs to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops by its own that share callbacks used by both. Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will show an empty list for clsact. Either using the parent names (ingress/egress) or specifying the full major/minor will then show the related filter lists. Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend. [1] http://patchwork.ozlabs.org/patch/512949/ Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- include/linux/rtnetlink.h | 5 +++++ include/uapi/linux/pkt_sched.h | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d8e5ca951b4..2285596e7045 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1739,7 +1739,9 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif - +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *egress_cl_list; +#endif #ifdef CONFIG_NET_SWITCHDEV u32 offload_fwd_mark; #endif diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4be5048b1fbe..c006cc900c44 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -84,6 +84,11 @@ void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif +#ifdef CONFIG_NET_EGRESS +void net_inc_egress_queue(void); +void net_dec_egress_queue(void); +#endif + extern void rtnetlink_init(void); extern void __rtnl_unlock(void); diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8d2530daca9f..8cb18b44968e 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -72,6 +72,10 @@ struct tc_estimator { #define TC_H_UNSPEC (0U) #define TC_H_ROOT (0xFFFFFFFFU) #define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U /* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ enum tc_link_layer { -- cgit v1.2.3 From 712f4aad406bb1ed67f3f98d04c044191f0ff593 Mon Sep 17 00:00:00 2001 From: willy tarreau Date: Sun, 10 Jan 2016 07:54:56 +0100 Subject: unix: properly account for FDs passed over unix sockets It is possible for a process to allocate and accumulate far more FDs than the process' limit by sending them over a unix socket then closing them to keep the process' fd count low. This change addresses this problem by keeping track of the number of FDs in flight per user and preventing non-privileged processes from having more FDs in flight than their configured FD limit. Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: Willy Tarreau Signed-off-by: David S. Miller --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index edad7a43edea..fbf25f19b3b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -830,6 +830,7 @@ struct user_struct { unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ + unsigned long unix_inflight; /* How many files in flight in unix sockets */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ -- cgit v1.2.3 From 781c53bc5d5628065a46c70f02f5a0450f5842f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jan 2016 01:16:38 +0100 Subject: bpf: export helper function flags and reject invalid ones Export flags used by eBPF helper functions through UAPI, so they can be used by programs (instead of them redefining all flags each time or just using the hard-coded values). It also gives a better overview what flags are used where and we can further get rid of the extra macros defined in filter.c. Moreover, reject invalid flags. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8bed7f1176b8..d94797ce9a5a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -273,6 +273,22 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +#define BPF_F_HDR_FIELD_MASK 0xfULL + +/* BPF_FUNC_l4_csum_replace flags. */ +#define BPF_F_PSEUDO_HDR (1ULL << 4) + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +#define BPF_F_INGRESS (1ULL << 0) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From c6c33454072fc9fe961e2b25f22a619e4fa98838 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jan 2016 01:16:39 +0100 Subject: bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key After IPv6 support has recently been added to metadata dst and related encaps, add support for populating/reading it from an eBPF program. Commit d3aa45ce6b ("bpf: add helpers to access tunnel metadata") started with initial IPv4-only support back then (due to IPv6 metadata support not being available yet). To stay compatible with older programs, we need to test for the passed structure size. Also TOS and TTL support from the ip_tunnel_info key has been added. Tested with vxlan devs in collect meta data mode with IPv4, IPv6 and in compat mode over different network namespaces. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d94797ce9a5a..aa6f8571de13 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -289,6 +289,9 @@ enum bpf_func_id { /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +#define BPF_F_TUNINFO_IPV6 (1ULL << 0) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -312,7 +315,12 @@ struct __sk_buff { struct bpf_tunnel_key { __u32 tunnel_id; - __u32 remote_ipv4; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From f0d22d1874730530a2ac304fd0888cb8a6864527 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:25:57 +0200 Subject: net/mlx5_core: Introduce flow steering autogrouped flow table When user add rule to autogrouped flow table, we search for flow group with the same match criteria, if we don't find such group then we create new flow group with the required match criteria and insert the rule to this group. We divide the flow table into required_groups + 1, in order to reserve a part of the flow table for rules which don't match any existing group. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index bc7ad019afde..06ac6e8fccfa 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -61,6 +61,12 @@ struct mlx5_flow_namespace * mlx5_get_flow_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type); +struct mlx5_flow_table * +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + int max_num_groups); + struct mlx5_flow_table * mlx5_create_flow_table(struct mlx5_flow_namespace *ns, int prio, -- cgit v1.2.3 From 2cc43b494a6c30ec0e554ea91ce763c97069e8cc Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:25:59 +0200 Subject: net/mlx5_core: Managing root flow table The root Flow Table for each Flow Table Type is defined, by default, as the Flow Table with level 0. In order not to use an empty flow tables and introduce new hops, but still preserve space for flow-tables that have a priority greater(lower number) than the current flow table, we introduce this new set root flow table command. This command tells the HW to start matching packets from the assigned root flow table. This command is used when we create new flow table with level lower than the current lowest flow table or it is the first flow table. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1780a85a8797..323e713c44ba 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -185,6 +185,7 @@ enum { MLX5_CMD_OP_MODIFY_RQT = 0x917, MLX5_CMD_OP_DESTROY_RQT = 0x918, MLX5_CMD_OP_QUERY_RQT = 0x919, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT = 0x92f, MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931, MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, @@ -258,7 +259,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; u8 reserved_0[0x2]; u8 flow_modify_en[0x1]; - u8 reserved_1[0x1c]; + u8 modify_root[0x1]; + u8 reserved_1[0x1b]; u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; @@ -6946,4 +6948,31 @@ union mlx5_ifc_uplink_pci_interface_document_bits { u8 reserved_0[0x20060]; }; +struct mlx5_ifc_set_flow_table_root_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_flow_table_root_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x140]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 34a40e689393a6b13673ab395a9a4d063d249fe9 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:00 +0200 Subject: net/mlx5_core: Introduce modify flow table command Introduce the modify flow table command. This command is used when we want to change the next flow table of an existing flow table. The next flow table is defined as the table we search (in order to find a match), if we couldn't find a match in any of the flow table entries in the current flow table. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 56 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 323e713c44ba..7f166955d4c9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -194,7 +194,8 @@ enum { MLX5_CMD_OP_QUERY_FLOW_GROUP = 0x935, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x937, - MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938 + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938, + MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c }; struct mlx5_ifc_flow_table_fields_supported_bits { @@ -260,7 +261,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_0[0x2]; u8 flow_modify_en[0x1]; u8 modify_root[0x1]; - u8 reserved_1[0x1b]; + u8 identified_miss_table_mode[0x1]; + u8 flow_table_modify[0x1]; + u8 reserved_1[0x19]; u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; @@ -5669,12 +5672,16 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_4[0x20]; - u8 reserved_5[0x8]; + u8 reserved_5[0x4]; + u8 table_miss_mode[0x4]; u8 level[0x8]; u8 reserved_6[0x8]; u8 log_size[0x8]; - u8 reserved_7[0x120]; + u8 reserved_7[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_8[0x100]; }; struct mlx5_ifc_create_flow_group_out_bits { @@ -6975,4 +6982,45 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_5[0x140]; }; +enum { + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = 0x1, +}; + +struct mlx5_ifc_modify_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x10]; + u8 modify_field_select[0x10]; + + u8 table_type[0x8]; + u8 reserved_4[0x18]; + + u8 reserved_5[0x8]; + u8 table_id[0x18]; + + u8 reserved_6[0x4]; + u8 table_miss_mode[0x4]; + u8 reserved_7[0x18]; + + u8 reserved_8[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_9[0x100]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 4cbdd30ed5c8bc5cf40813b025b4fb57b376a592 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:04 +0200 Subject: net/mlx5_core: Enable flow steering support for the IB driver When the driver is loaded, we create flow steering namespace for kernel bypass with nine priorities and another namespace for leftovers(in order to catch packets that weren't matched). Verbs applications will use these priorities. we found nine as a number that balances the requirements from the user and retains performance. The bypass namespace is used by verbs applications that want to bypass the kernel networking stack. The leftovers namespace is used by verbs applications and the sniffer in order to catch packets that weren't handled by any preceding rules. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 2 ++ include/linux/mlx5/fs.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index df2f79ef3cac..7be845e30689 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1258,4 +1258,6 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } +#define MLX5_BY_PASS_NUM_PRIOS 9 + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 06ac6e8fccfa..a94341271e3f 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -39,7 +39,9 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_LEFTOVERS, MLX5_FLOW_NAMESPACE_FDB, }; -- cgit v1.2.3 From b4d1f032d75b2efb73304e8c12faa7149ad700c7 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:05 +0200 Subject: net/mlx5_core: Make ipv4/ipv6 location more clear Change the mlx5 firmware interface header to make it more clear which bytes should be used by IPv4 or IPv6 addresses. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7f166955d4c9..68d73f82e009 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -298,6 +298,22 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_1[0x1a]; }; +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_0[0x80]; +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; @@ -328,9 +344,9 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 udp_sport[0x10]; u8 udp_dport[0x10]; - u8 src_ip[4][0x20]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; - u8 dst_ip[4][0x20]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; }; struct mlx5_ifc_fte_match_set_misc_bits { -- cgit v1.2.3 From 038d2ef87572757861a177b19f9d489def2c48b8 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:07 +0200 Subject: IB/mlx5: Add flow steering support Adding flow steering support by creating a flow-table per priority (if rules exist in the priority). mlx5_ib uses autogrouping and thus only creates the required destinations. Also includes adding of these flow steering utilities 1. Parsing verbs flow attributes hardware steering specs. 2. Check if flow is multicast - this is required in order to decide to which flow table will we add the steering rule. 3. Set outer headers in flow match criteria to zeros. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index a94341271e3f..8230caa3fb6e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,16 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +#define LEFTOVERS_RULE_NUM 2 +static inline void build_leftovers_ft_param(int *priority, + int *n_ent, + int *n_grp) +{ + *priority = 0; /* Priority of leftovers_prio-0 */ + *n_ent = LEFTOVERS_RULE_NUM; + *n_grp = LEFTOVERS_RULE_NUM; +} + enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, -- cgit v1.2.3 From b6a0e72ad3cffabaf30b856deb58fbe64a0f36a8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 11 Jan 2016 10:19:10 -0800 Subject: net: Fix typo in netdev_intersect_features Obviously need to 'or in NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Fixes: c8cd0989bd151f ("net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM") Reported-by: Jack Morgenstein Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2285596e7045..5ac140dcb789 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3873,9 +3873,9 @@ static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, { if ((f1 ^ f2) & NETIF_F_HW_CSUM) { if (f1 & NETIF_F_HW_CSUM) - f1 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); else - f2 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } return f1 & f2; -- cgit v1.2.3