From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:50 -0800 Subject: net: better skb->sender_cpu and skb->napi_id cohabitation skb->sender_cpu and skb->napi_id share a common storage, and we had various bugs about this. We had to call skb_sender_cpu_clear() in some places to not leave a prior skb->napi_id and fool netdev_pick_tx() As suggested by Alexei, we could split the space so that these errors can not happen. 0 value being reserved as the common (not initialized) value, let's reserve [1 .. NR_CPUS] range for valid sender_cpu, and [NR_CPUS+1 .. ~0U] for valid napi_id. This will allow proper busy polling support over tunnels. Signed-off-by: Eric Dumazet Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4355129fff91..c9c394bf0771 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1082,9 +1082,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) static inline void skb_sender_cpu_clear(struct sk_buff *skb) { -#ifdef CONFIG_XPS - skb->sender_cpu = 0; -#endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET -- cgit v1.2.3 From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:52 -0800 Subject: net: un-inline sk_busy_loop() There is really little gain from inlining this big function. We'll soon make it even bigger in following patches. This means we no longer need to export napi_by_id() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1..2020a89df12b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -460,15 +460,6 @@ static inline void napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_by_id - lookup a NAPI by napi_id - * @napi_id: hashed napi_id - * - * lookup @napi_id in napi_hash table - * must be called under rcu_read_lock() - */ -struct napi_struct *napi_by_id(unsigned int napi_id); - /** * napi_hash_add - add a NAPI to global hashtable * @napi: napi context -- cgit v1.2.3 From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:00 -0800 Subject: net: add netif_tx_napi_add() netif_tx_napi_add() is a variant of netif_napi_add() It should be used by drivers that use a napi structure to exclusively poll TX. We do not want to add this kind of napi in napi_hash[] in following patches, adding generic busy polling to all NAPI drivers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2020a89df12b..838935d1cdbb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -326,7 +326,8 @@ enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash */ + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ }; enum gro_result { @@ -1938,6 +1939,26 @@ static inline void *netdev_priv(const struct net_device *dev) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); +/** + * netif_tx_napi_add - initialize a napi context + * @dev: network device + * @napi: napi context + * @poll: polling function + * @weight: default weight + * + * This variant of netif_napi_add() should be used from drivers using NAPI + * to exclusively poll a TX queue. + * This will avoid we add it into napi_hash[], thus polluting this hash table. + */ +static inline void netif_tx_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add(dev, napi, poll, weight); +} + /** * netif_napi_del - remove a napi context * @napi: napi context -- cgit v1.2.3 From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:01 -0800 Subject: net: move napi_hash[] into read mostly section We do not often add/delete a napi context. Moving napi_hash[] into read mostly section avoids potential false sharing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/hashtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index 519b6e2d769e..661e5c2a8e2a 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -16,6 +16,10 @@ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } +#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] __read_mostly = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] -- cgit v1.2.3 From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:02 -0800 Subject: net: napi_hash_del() returns a boolean status napi_hash_del() will soon be used from both drivers (if they want) or core networking stack. Callers are responsibles to ensure an RCU grace period is respected before freeing napi structure : napi_hash_del() can signal if this RCU grace period is needed or not. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838935d1cdbb..e5c33b29471b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -474,9 +474,10 @@ void napi_hash_add(struct napi_struct *napi); * @napi: napi context * * Warning: caller must observe rcu grace period - * before freeing memory containing @napi + * before freeing memory containing @napi, if + * this function returns true. */ -void napi_hash_del(struct napi_struct *napi); +bool napi_hash_del(struct napi_struct *napi); /** * napi_disable - prevent NAPI from scheduling -- cgit v1.2.3 From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:03 -0800 Subject: net: provide generic busy polling to all NAPI drivers NAPI drivers no longer need to observe a particular protocol to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y) napi_hash_add() and napi_hash_del() are automatically called from core networking stack, respectively from netif_napi_add() and netif_napi_del() This patch depends on free_netdev() and netif_napi_del() being called from process context, which seems to be the norm. Drivers might still prefer to call napi_hash_del() on their own, since they might combine all the rcu grace periods into a single one, knowing their NAPI structures lifetime, while core networking stack has no idea of a possible combining. Once this patch proves to not bring serious regressions, we will cleanup drivers to either remove napi_hash_del() or provide appropriate rcu grace periods combining. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e5c33b29471b..7d2d1d7aaec7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -466,6 +466,9 @@ static inline void napi_complete(struct napi_struct *n) * @napi: napi context * * generate a new napi_id and store a @napi under it in napi_hash + * Used for busy polling (CONFIG_NET_RX_BUSY_POLL) + * Note: This is normally automatically done from netif_napi_add(), + * so might disappear in a future linux version. */ void napi_hash_add(struct napi_struct *napi); @@ -476,6 +479,10 @@ void napi_hash_add(struct napi_struct *napi); * Warning: caller must observe rcu grace period * before freeing memory containing @napi, if * this function returns true. + * Note: core networking stack automatically calls it + * from netif_napi_del() + * Drivers might want to call this helper to combine all + * the needed rcu grace periods into a single one. */ bool napi_hash_del(struct napi_struct *napi); -- cgit v1.2.3 From a8acce6aa584aa731a2bed240bcd8dc955f01414 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 19 Nov 2015 12:53:21 +0100 Subject: ppp: remove PPPOX_ZOMBIE socket state PPPOX_ZOMBIE is never set anymore. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/linux/if_pppox.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index b49cf923becc..ba7a9b0c7c57 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -91,7 +91,6 @@ enum { PPPOX_CONNECTED = 1, /* connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ PPPOX_RELAY = 4, /* forwarding is enabled */ - PPPOX_ZOMBIE = 8, /* dead, but still bound to ppp device */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; -- cgit v1.2.3 From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant() using it cgroup_is_descendant() currently walks up the hierarchy and compares each ancestor to the cgroup in question. While enough for cgroup core usages, this can't be used in hot paths to test cgroup membership. This patch adds cgroup->ancestor_ids[] which records the IDs of all ancestors including self and cgroup->level for the nesting level. This allows testing whether a given cgroup is a descendant of another in three finite steps - testing whether the two belong to the same hierarchy, whether the descendant candidate is at the same or a higher level than the ancestor and comparing the recorded ancestor_id at the matching level. cgroup_is_descendant() is accordingly reimplmented and made inline. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ include/linux/cgroup.h | 18 +++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..504d8591b6d3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -234,6 +234,14 @@ struct cgroup { */ int id; + /* + * The depth this cgroup is at. The root is at depth zero and each + * step down the hierarchy increments the level. This along with + * ancestor_ids[] can determine whether a given cgroup is a + * descendant of another without traversing the hierarchy. + */ + int level; + /* * Each non-empty css_set associated with this cgroup contributes * one to populated_cnt. All children with non-zero popuplated_cnt @@ -289,6 +297,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + + /* ids of the ancestors at each level including self */ + int ancestor_ids[]; }; /* @@ -308,6 +319,9 @@ struct cgroup_root { /* The root cgroup. Root is destroyed on its release. */ struct cgroup cgrp; + /* for cgrp->ancestor_ids[0] */ + int cgrp_ancestor_id_storage; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..b5ee2c4210f9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,7 +81,6 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -459,6 +458,23 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +static inline bool cgroup_is_descendant(struct cgroup *cgrp, + struct cgroup *ancestor) +{ + if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) + return false; + return cgrp->ancestor_ids[ancestor->level] == ancestor->id; +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { -- cgit v1.2.3 From bd96f76a2454c6b97d70945902e30b4c31510678 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: kernfs: implement kernfs_walk_and_get() Implement kernfs_walk_and_get() which is similar to kernfs_find_and_get() but can walk a path instead of just a name. v2: Use strlcpy() instead of strlen() + memcpy() as suggested by David. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: David Miller --- include/linux/kernfs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5d4e9c4b821d..af51df35d749 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -274,6 +274,8 @@ void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); @@ -350,6 +352,10 @@ static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } +static inline struct kernfs_node * +kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, + const void *ns) +{ return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } @@ -430,6 +436,12 @@ kernfs_find_and_get(struct kernfs_node *kn, const char *name) return kernfs_find_and_get_ns(kn, name, NULL); } +static inline struct kernfs_node * +kernfs_walk_and_get(struct kernfs_node *kn, const char *path) +{ + return kernfs_walk_and_get_ns(kn, path, NULL); +} + static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) -- cgit v1.2.3 From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put() Implement cgroup_get_from_path() using kernfs_walk_and_get() which obtains a default hierarchy cgroup from its path. This will be used to allow cgroup path based matching from outside cgroup proper - e.g. networking and perf. v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path). Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b5ee2c4210f9..4c3ffab81ba7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -81,6 +81,8 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); +struct cgroup *cgroup_get_from_path(const char *path); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); @@ -351,6 +353,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_put(struct cgroup *cgrp) +{ + css_put(&cgrp->self); +} + /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for -- cgit v1.2.3 From f7ccdb96fa31305d480678b1ba81225907dd81ef Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 11 Nov 2015 20:17:37 -0200 Subject: netfilter: nf_ct_sctp: move ip_ct_sctp away from UAPI ip_ct_sctp is an internal structure, embedded by the union nf_conntrack_proto to store sctp-specific information at conntrack entries. It has no business with UAPI. This patch moves it from UAPI to a saner place, together with similar structs for other protocols. Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sctp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/linux/netfilter/nf_conntrack_sctp.h (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h new file mode 100644 index 000000000000..22a16a23cd8a --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -0,0 +1,13 @@ +#ifndef _NF_CONNTRACK_SCTP_H +#define _NF_CONNTRACK_SCTP_H +/* SCTP tracking. */ + +#include + +struct ip_ct_sctp { + enum sctp_conntrack state; + + __be32 vtag[IP_CT_DIR_MAX]; +}; + +#endif /* _NF_CONNTRACK_SCTP_H */ -- cgit v1.2.3 From 9458ceab49179b7fd2d5192fd9dcf316ca195dc0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 24 Nov 2015 15:30:21 -0800 Subject: net: phy: bcm7xxx: Add entry for Broadcom BCM7435 Add a PHY entry for the Broadcom BCM7435 chips, this is a 40nm generation Ethernet PHY which is analogous to its 7425 and 7429 counter parts. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 59f4a7304419..f0ba9c2ec639 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -26,6 +26,7 @@ #define PHY_ID_BCM7366 0x600d8490 #define PHY_ID_BCM7425 0x600d86b0 #define PHY_ID_BCM7429 0x600d8730 +#define PHY_ID_BCM7435 0x600d8750 #define PHY_ID_BCM7439 0x600d8480 #define PHY_ID_BCM7439_2 0xae025080 #define PHY_ID_BCM7445 0x600d8510 -- cgit v1.2.3 From 1ce0bf50ae2233c7115a18c0c623662d177b434c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Nov 2015 13:55:39 +0800 Subject: net: Generalise wq_has_sleeper helper The memory barrier in the helper wq_has_sleeper is needed by just about every user of waitqueue_active. This patch generalises it by making it take a wait_queue_head_t directly. The existing helper is renamed to skwq_has_sleeper. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/wait.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..6aa09a875fbd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -107,6 +107,27 @@ static inline int waitqueue_active(wait_queue_head_t *q) return !list_empty(&q->task_list); } +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return waitqueue_active(wq); +} + extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); -- cgit v1.2.3 From 06bd6c0370bb88a2256c6763a32bc4e4ade06521 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:45 +0100 Subject: net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum MFC_NOTIFY was introduced in kernel 2.1.68 but afaik it hasn't been used and I couldn't find any users currently so just remove it. Only MFC_STATIC is left, so move it into an enum, add a description and use BIT(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 79aaa9fc1a15..fa66ebc1fed6 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -64,6 +64,13 @@ struct vif_device { #define VIFF_STATIC 0x8000 +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + */ +enum { + MFC_STATIC = BIT(0), +}; + struct mfc_cache { struct list_head list; __be32 mfc_mcastgrp; /* Group the entry belongs to */ @@ -89,9 +96,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_LINES 64 #ifdef __BIG_ENDIAN -- cgit v1.2.3 From 520191bb404c4b7b4cdb70a5480ada974b0c2d60 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:46 +0100 Subject: net: ipmr: adjust mroute.h style and drop extern Remove extra spaces and tabs, adjust function definitions, remove an unnecessary ifdef (already used below, just move code) and drop extern from the functions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index fa66ebc1fed6..7c567a2679ce 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,38 +9,28 @@ #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { - return (opt >= MRT_BASE) && (opt <= MRT_MAX); + return opt >= MRT_BASE && opt <= MRT_MAX; } -#else -static inline int ip_mroute_opt(int opt) -{ - return 0; -} -#endif -#ifdef CONFIG_IP_MROUTE -extern int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); -extern int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); -extern int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); -extern int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); -extern int ip_mr_init(void); +int ip_mroute_setsockopt(struct sock *, int, char __user *, unsigned int); +int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); +int ip_mr_init(void); #else -static inline -int ip_mroute_setsockopt(struct sock *sock, - int optname, char __user *optval, unsigned int optlen) +static inline int ip_mroute_setsockopt(struct sock *sock, int optname, + char __user *optval, unsigned int optlen) { return -ENOPROTOOPT; } -static inline -int ip_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sock, int optname, + char __user *optval, int __user *optlen) { return -ENOPROTOOPT; } -static inline -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { return -ENOIOCTLCMD; } @@ -49,6 +39,11 @@ static inline int ip_mr_init(void) { return 0; } + +static inline int ip_mroute_opt(int opt) +{ + return 0; +} #endif struct vif_device { @@ -96,16 +91,16 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 +#define MFC_LINES 64 #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else #define MFC_HASH(a,b) ((((__force u32)(__be32)a)^(((__force u32)(__be32)b)>>2))&(MFC_LINES-1)) -#endif +#endif struct rtmsg; -extern int ipmr_get_route(struct net *net, struct sk_buff *skb, - __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait); +int ipmr_get_route(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr, + struct rtmsg *rtm, int nowait); #endif -- cgit v1.2.3 From 5ea1f13299d8b8edcb2969eda4c81f8e3264b706 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:47 +0100 Subject: net: ipmr: move struct mr_table and VIF_EXISTS to mroute.h Move the definitions of VIF_EXISTS() and struct mr_table to mroute.h Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7c567a2679ce..bf9b322cb0b0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -59,6 +59,25 @@ struct vif_device { #define VIFF_STATIC 0x8000 +#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) +#define MFC_LINES 64 + +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct list_head mfc_cache_array[MFC_LINES]; + struct vif_device vif_table[MAXVIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) */ @@ -91,8 +110,6 @@ struct mfc_cache { struct rcu_head rcu; }; -#define MFC_LINES 64 - #ifdef __BIG_ENDIAN #define MFC_HASH(a,b) (((((__force u32)(__be32)a)>>24)^(((__force u32)(__be32)b)>>26))&(MFC_LINES-1)) #else -- cgit v1.2.3 From 1973a4ea6ceaa47671227c3077f90508ea30897b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 26 Nov 2015 15:23:48 +0100 Subject: net: ipmr: move pimsm_enabled to pim.h and rename Move the inline pimsm_enabled() to pim.h and rename it to ipmr_pimsm_enabled to show it's for the ipv4 ipmr code since pim.h is used by IPv6 too. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/pim.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pim.h b/include/linux/pim.h index 252bf6644c51..e1d756f81348 100644 --- a/include/linux/pim.h +++ b/include/linux/pim.h @@ -13,6 +13,11 @@ #define PIM_NULL_REGISTER cpu_to_be32(0x40000000) +static inline bool ipmr_pimsm_enabled(void) +{ + return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2); +} + /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */ struct pimreghdr { -- cgit v1.2.3 From 91420b83baa046ada1a899c97f3b2c52a9045705 Mon Sep 17 00:00:00 2001 From: Sudarsana Kalluru Date: Mon, 30 Nov 2015 12:25:03 +0200 Subject: qed: Add support for changing LED state Physical LEDs are being controlled by the management FW. This adds the qed functionality required to request management FW to change the LED configuration, as well as the necessary APIs for this functionality to later be used by the protocol drivers. Signed-off-by: Sudarsana Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index dc9a1353f971..d4a32e878180 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -25,6 +25,12 @@ #include #include +enum qed_led_mode { + QED_LED_MODE_OFF, + QED_LED_MODE_ON, + QED_LED_MODE_RESTORE +}; + #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \ (void __iomem *)(reg_addr)) @@ -252,6 +258,17 @@ struct qed_common_ops { void (*chain_free)(struct qed_dev *cdev, struct qed_chain *p_chain); + +/** + * @brief set_led - Configure LED mode + * + * @param cdev + * @param mode - LED mode + * + * @return 0 on success, error otherwise. + */ + int (*set_led)(struct qed_dev *cdev, + enum qed_led_mode mode); }; /** -- cgit v1.2.3 From c0eb454034aab783dc602739237a63b30867f5bd Mon Sep 17 00:00:00 2001 From: KY Srinivasan Date: Tue, 1 Dec 2015 16:43:10 -0800 Subject: hv_netvsc: Don't ask for additional head room in the skb The rndis header is 116 bytes big and can be placed in the default head room that will be available in the skb. Since the netvsc packet is less than 48 bytes, we can use the skb control buffer for the netvsc packet. With these changes we don't need to ask for additional head room. Signed-off-by: K. Y. Srinivasan Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d2d1d7aaec7..fcbc5259c630 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -132,7 +132,9 @@ static inline bool dev_xmit_complete(int rc) * used. */ -#if defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) +#if defined(CONFIG_HYPERV_NET) +# define LL_MAX_HEADER 128 +#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else -- cgit v1.2.3 From c981e4213e9d2d4ec79501bd607722ec712742a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:06 +0100 Subject: net: add netif_is_team_master helper Similar to other helpers, caller can use this to find out if device is team master. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fcbc5259c630..2b889be65d88 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1273,6 +1273,7 @@ struct net_device_ops { * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device + * @IFF_TEAM: device is a team device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1299,6 +1300,7 @@ enum netdev_priv_flags { IFF_NO_QUEUE = 1<<21, IFF_OPENVSWITCH = 1<<22, IFF_L3MDEV_SLAVE = 1<<23, + IFF_TEAM = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1325,6 +1327,7 @@ enum netdev_priv_flags { #define IFF_NO_QUEUE IFF_NO_QUEUE #define IFF_OPENVSWITCH IFF_OPENVSWITCH #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE +#define IFF_TEAM IFF_TEAM /** * struct net_device - The DEVICE structure. @@ -3889,6 +3892,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_team_master(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From f7f019ee6d117de5007d0b10e7960696bbf111eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:07 +0100 Subject: net: add netif_is_team_port helper Similar to other helpers, caller can use this to find out if device is team port. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b889be65d88..b3601f8a9b42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3897,6 +3897,11 @@ static inline bool netif_is_team_master(struct net_device *dev) return dev->priv_flags & IFF_TEAM; } +static inline bool netif_is_team_port(struct net_device *dev) +{ + return dev->priv_flags & IFF_TEAM_PORT; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 7be61833042e7757745345eedc7b0efee240c189 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:08 +0100 Subject: net: add netif_is_lag_master helper Some code does not mind if the master is bond or team and treats them the same, as generic LAG. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b3601f8a9b42..3ca083efa560 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3902,6 +3902,11 @@ static inline bool netif_is_team_port(struct net_device *dev) return dev->priv_flags & IFF_TEAM_PORT; } +static inline bool netif_is_lag_master(struct net_device *dev) +{ + return netif_is_bond_master(dev) || netif_is_team_master(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From e0ba1414f310c83bf425fe26fa2cd5f1befcd6dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:09 +0100 Subject: net: add netif_is_lag_port helper Some code does not mind if a device is bond slave or team port and treats them the same, as generic LAG ports. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ca083efa560..1506be58c59a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3907,6 +3907,11 @@ static inline bool netif_is_lag_master(struct net_device *dev) return netif_is_bond_master(dev) || netif_is_team_master(dev); } +static inline bool netif_is_lag_port(struct net_device *dev) +{ + return netif_is_bond_slave(dev) || netif_is_team_port(dev); +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:10 +0100 Subject: net: propagate upper priv via netdev_master_upper_dev_link Eliminate netdev_master_upper_dev_link_private and pass priv directly as a parameter of netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1506be58c59a..939b8f3de810 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3619,10 +3619,8 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev); -int netdev_master_upper_dev_link_private(struct net_device *dev, - struct net_device *upper_dev, - void *private); + struct net_device *upper_dev, + void *upper_priv); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:11 +0100 Subject: net: add possibility to pass information about upper device via notifier Sometimes the drivers and other code would find it handy to know some internal information about upper device being changed. So allow upper-code to pass information down to notifier listeners during linking. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 939b8f3de810..aea556c64f2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2163,6 +2163,7 @@ struct netdev_notifier_changeupper_info { struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the nofication for link or unlink */ + void *upper_info; /* upper dev info */ }; static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, @@ -3620,7 +3621,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv); + void *upper_priv, void *upper_info); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); -- cgit v1.2.3 From 764f5e544118508add420724789f46e04dba91eb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:12 +0100 Subject: net: add info struct for LAG changeupper This struct will be shared by bonding and team to pass internal information to notifier listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aea556c64f2c..3ab90ea0ed03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2110,6 +2110,19 @@ struct pcpu_sw_netstats { #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL); +enum netdev_lag_tx_type { + NETDEV_LAG_TX_TYPE_UNKNOWN, + NETDEV_LAG_TX_TYPE_RANDOM, + NETDEV_LAG_TX_TYPE_BROADCAST, + NETDEV_LAG_TX_TYPE_ROUNDROBIN, + NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, + NETDEV_LAG_TX_TYPE_HASH, +}; + +struct netdev_lag_upper_info { + enum netdev_lag_tx_type tx_type; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From 8fd728566a354f7bc9cb6e781f185b8c39cf505b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:13 +0100 Subject: team: fill-up LAG changeupper info struct and pass it along Initialize netdev_lag_upper_info structure by TX type according to current team mode and pass it along via netdev_master_upper_dev_link. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_team.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index a6aa970758a2..b84e49c3a738 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -164,6 +164,7 @@ struct team_mode { size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; + enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 -- cgit v1.2.3 From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:15 +0100 Subject: net: introduce change lower state notifier When lower device like bonding slave, team/bridge port, etc changes its state, it is useful for others to notice this change. Currently this is implemented specificly for bonding as NETDEV_BONDING_INFO notifier. This patch aims to replace this specific usage and make this more generic to be used for all upper-lower devices. Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and netdev_lower_state_changed() helper. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ab90ea0ed03..ad69f237aa78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,6 +2158,7 @@ struct netdev_lag_upper_info { #define NETDEV_CHANGEINFODATA 0x0018 #define NETDEV_BONDING_INFO 0x0019 #define NETDEV_PRECHANGEUPPER 0x001A +#define NETDEV_CHANGELOWERSTATE 0x001B int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -2179,6 +2180,11 @@ struct netdev_notifier_changeupper_info { void *upper_info; /* upper dev info */ }; +struct netdev_notifier_changelowerstate_info { + struct netdev_notifier_info info; /* must be first */ + void *lower_state_info; /* is lower dev state */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { @@ -3640,6 +3646,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); +void netdev_lower_state_changed(struct net_device *lower_dev, + void *lower_state_info); /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 -- cgit v1.2.3 From fb1b2e3ce53aef80b3cef71f3885d584cdbdc6b8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Dec 2015 12:12:16 +0100 Subject: net: introduce lower state changed info structure for LAG lowers This is shared info structure for bonding and team. Serves to pass down info about link state and port activity to notification listeners. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ad69f237aa78..fa84b59eb197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2123,6 +2123,11 @@ struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; }; +struct netdev_lag_lower_state_info { + u8 link_up : 1, + tx_enabled : 1; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink -- cgit v1.2.3 From fc50db98ff872372f266695858f87a12eb1b4f05 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 1 Dec 2015 18:03:09 +0200 Subject: net/mlx5_core: Add base sriov support This patch adds SRIOV base support for mlx5 supported devices. The same driver is used for both PFs and VFs; VFs are identified by the driver through the flag MLX5_PCI_DEV_IS_VF added to the pci table entries. Virtual functions are created as usual through writing a value to the sriov_numvs sysfs file of the PF device. Upon instantiating VFs, they will all be probed by the driver on the hypervisor. One can gracefully unbind them through /sys/bus/pci/drivers/mlx5_core/unbind. mlx5_wait_for_vf_pages() was added to ensure that when a VF dies without executing proper teardown, the hypervisor driver waits till all of the pages that were allocated at the hypervisor to maintain its operation are returned. In order for the VF to be operational, the PF needs to call enable_hca for it. This can be done before the VFs are created through a call to pci_enable_sriov. If the there are VFs assigned to a VMs when the driver of the PF is unloaded, all the VF will experience system error and PF driver unloads cleanly; in this case pci_disable_sriov is not called and the devices will show when running lspci. Once the PF driver is reloaded, it will sync its data structures which maintain state on its VFs. Signed-off-by: Eli Cohen Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 24 ++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5c857f2a20d7..efebb87163c8 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -426,6 +426,16 @@ struct mlx5_mr_table { struct radix_tree_root tree; }; +struct mlx5_vf_context { + int enabled; +}; + +struct mlx5_core_sriov { + struct mlx5_vf_context *vfs_ctx; + int num_vfs; + int enabled_vfs; +}; + struct mlx5_irq_info { cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; @@ -447,6 +457,7 @@ struct mlx5_priv { int fw_pages; atomic_t reg_pages; struct list_head free_list; + int vfs_pages; struct mlx5_core_health health; @@ -485,6 +496,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + struct mlx5_core_sriov sriov; + unsigned long pci_dev_data; }; enum mlx5_device_state { @@ -739,6 +752,8 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +int mlx5_sriov_init(struct mlx5_core_dev *dev); +int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); @@ -884,6 +899,15 @@ struct mlx5_profile { } mr_cache[MAX_MR_CACHE_ENTRIES]; }; +enum { + MLX5_PCI_DEV_IS_VF = 1 << 0, +}; + +static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) +{ + return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); +} + static inline int mlx5_get_gid_table_len(u16 param) { if (param > 4) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1565324eb620..9b76fddd696b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18[0x4]; + u8 reserved_18_0; + u8 early_vf_enable; + u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; u8 reserved_19[0x6]; u8 port_type[0x2]; -- cgit v1.2.3 From 54f0a411ec72cb437d57d0c9654dcbd0f198ff3a Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:10 +0200 Subject: net/mlx5: Add HW capabilities and structs for SR-IOV E-Switch Update HCA capabilities and HW struct to include needed capabilities for upcoming Ethernet Switch (SR-IOV E-Switch). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9b76fddd696b..836cf0e43174 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -665,7 +665,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_17[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; - u8 reserved_18_0; + u8 eswitch_flow_table[0x1]; u8 early_vf_enable; u8 reserved_18[0x2]; u8 local_ca_ack_delay[0x5]; @@ -789,22 +789,30 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_60[0x1b]; u8 log_max_wq_sz[0x5]; - u8 reserved_61[0xa0]; - + u8 nic_vport_change_event[0x1]; + u8 reserved_61[0xa]; + u8 log_max_vlan_list[0x5]; u8 reserved_62[0x3]; + u8 log_max_current_mc_list[0x5]; + u8 reserved_63[0x3]; + u8 log_max_current_uc_list[0x5]; + + u8 reserved_64[0x80]; + + u8 reserved_65[0x3]; u8 log_max_l2_table[0x5]; - u8 reserved_63[0x8]; + u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_64[0x100]; + u8 reserved_67[0xe0]; - u8 reserved_65[0x1f]; + u8 reserved_68[0x1f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; u8 cqe_zip_max_num[0x10]; - u8 reserved_66[0x220]; + u8 reserved_69[0x220]; }; enum { @@ -2135,10 +2143,6 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_wq_bits wq; }; -enum { - MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS = 0x0, -}; - struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; -- cgit v1.2.3 From e1d7d349c69d12721c420f1fe673ce9aa462aadd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:11 +0200 Subject: net/mlx5: Update access functions to Query/Modify vport MAC address In preparation for SR-IOV we add here an API to enable each e-switch client (PF/VF) to configure its L2 MAC addresses and for the e-switch manager (usually the PF) to access them in order to be able to configure them into the e-switch. Therefore we now pass vport num parameter to mlx5_query_nic_vport_context, so PF can access other vports contexts. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/vport.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 967e0fd06e89..43e82d9f5463 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,10 @@ #include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); -void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr); +int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, + u16 vport, u8 *addr); +int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, + u16 vport, u8 *addr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); -- cgit v1.2.3 From e16aea2744abea612c27ee0eef606c6a6a8204de Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:12 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport mac lists Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) UC/MC mac lists changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/vport.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b473cbfa7ef..0d2f0435a9f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1102,6 +1102,12 @@ enum { MLX5_FLOW_CONTEXT_DEST_TYPE_TIR = 2, }; +enum mlx5_list_type { + MLX5_NVPRT_LIST_TYPE_UC = 0x0, + MLX5_NVPRT_LIST_TYPE_MC = 0x1, + MLX5_NVPRT_LIST_TYPE_VLAN = 0x2, +}; + enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0, MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 43e82d9f5463..00bbec8d9527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -34,6 +34,7 @@ #define __MLX5_VPORT_H__ #include +#include u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, @@ -54,5 +55,14 @@ int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev, u64 *sys_image_guid); int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev, u64 *node_guid); +int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, + u32 vport, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int *list_size); +int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, + enum mlx5_list_type list_type, + u8 addr_list[][ETH_ALEN], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From e75465148b7df7f2796c75bf98bf33f171edeb2b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:13 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport state In preparation for SR-IOV we add here an API to enable each e-switch manager (PF) to configure its VFs link states in e-switch preparation for ethernet sriov. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 1 + include/linux/mlx5/vport.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 836cf0e43174..655184702ea2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2946,6 +2946,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT = 0x0, + MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT = 0x1, }; struct mlx5_ifc_query_vport_state_in_bits { diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 00bbec8d9527..c1bba5948851 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,7 +36,11 @@ #include #include -u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod); +u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport); +u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport); +int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, + u16 vport, u8 state); int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, -- cgit v1.2.3 From d82b73186dab70d6d332dd2afdb48608be2e5230 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:14 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport promisc mode Those functions are needed to notify the upcoming SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) promisc mode changes. Preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++----- include/linux/mlx5/vport.h | 9 +++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 655184702ea2..2728b5f6c017 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2147,16 +2147,31 @@ struct mlx5_ifc_nic_vport_context_bits { u8 reserved_0[0x1f]; u8 roce_en[0x1]; - u8 reserved_1[0x760]; + u8 arm_change_event[0x1]; + u8 reserved_1[0x1a]; + u8 event_on_mtu[0x1]; + u8 event_on_promisc_change[0x1]; + u8 event_on_vlan_change[0x1]; + u8 event_on_mc_address_change[0x1]; + u8 event_on_uc_address_change[0x1]; - u8 reserved_2[0x5]; + u8 reserved_2[0xf0]; + + u8 mtu[0x10]; + + u8 reserved_3[0x640]; + + u8 promisc_uc[0x1]; + u8 promisc_mc[0x1]; + u8 promisc_all[0x1]; + u8 reserved_4[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_3[0xc]; + u8 reserved_5[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_4[0x20]; + u8 reserved_6[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4235,7 +4250,10 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_0[0x1c]; + u8 reserved_0[0x19]; + u8 mtu[0x1]; + u8 change_event[0x1]; + u8 promisc[0x1]; u8 permanent_address[0x1]; u8 addresses_list[0x1]; u8 roce_en[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c1bba5948851..dbbaed9f975a 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -68,5 +68,14 @@ int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev, enum mlx5_list_type list_type, u8 addr_list[][ETH_ALEN], int list_size); +int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev, + u32 vport, + int *promisc_uc, + int *promisc_mc, + int *promisc_all); +int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, + int promisc_uc, + int promisc_mc, + int promisc_all); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From c0046cf7b81ac55b8bf056c71918ec04edd99379 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:15 +0200 Subject: net/mlx5: Introduce access functions to modify/query vport vlans Those functions are needed to notify the upcoming L2 table and SR-IOV E-Switch(FDB) manager(PF), of the NIC vport (vf) vlan table changes. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ include/linux/mlx5/vport.h | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2728b5f6c017..39487d0c305d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -910,6 +910,13 @@ struct mlx5_ifc_mac_address_layout_bits { u8 mac_addr_31_0[0x20]; }; +struct mlx5_ifc_vlan_layout_bits { + u8 reserved_0[0x14]; + u8 vlan[0x0c]; + + u8 reserved_1[0x20]; +}; + struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_0[0xa0]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index dbbaed9f975a..638f2ca7a527 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -77,5 +77,12 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, int promisc_uc, int promisc_mc, int promisc_all); +int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, + u32 vport, + u16 vlans[], + int *size); +int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, + u16 vlans[], + int list_size); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 073bb189a41d7bbad509b576a690611c46c4858f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:18 +0200 Subject: net/mlx5: Introducing E-Switch and l2 table E-Switch is the software entity that represents and manages ConnectX4 inter-HCA ethernet l2 switching. E-Switch has its own Virtual Ports, each Vport/vNIC/VF can be connected to the device through a vport of an e-switch. Each e-switch is managed by one vNIC identified by HCA_CAP.vport_group_manager (usually it is the PF/vport[0]), and its main responsibility is to forward each packet to the right vport. e-Switch needs to manage its own l2-table and FDB tables. L2 table is a flow table that is managed by FW, it is needed for Multi-host (Multi PF) configuration for inter HCA switching between PFs. FDB table is a flow table that is totally managed by e-Switch driver, its main responsibility is to switch packets between e-Swtich internal vports and uplink vport that belong to the same. This patch introduces only e-Swtich l2 table management, FDB managemnt will come later when ethernet SRIOV/VFs will be enabled. preperation for ethernet sriov and l2 table management. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 8 ++++++++ include/linux/mlx5/driver.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0d2f0435a9f0..90a4cb6dc4cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -251,6 +251,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, + MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, }; enum { @@ -520,6 +521,12 @@ struct mlx5_eqe_page_fault { __be32 flags_qpn; } __packed; +struct mlx5_eqe_vport_change { + u8 rsvd0[2]; + __be16 vport_num; + __be32 rsvd1[6]; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -532,6 +539,7 @@ union ev_data { struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; struct mlx5_eqe_page_fault page_fault; + struct mlx5_eqe_vport_change vport_change; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index efebb87163c8..ac098b6b97bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -441,6 +441,8 @@ struct mlx5_irq_info { char name[MLX5_MAX_IRQ_NAME]; }; +struct mlx5_eswitch; + struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; @@ -496,6 +498,8 @@ struct mlx5_priv { struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; + + struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; }; -- cgit v1.2.3 From 495716b191f607b2cb2175f7499966daef79f663 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:19 +0200 Subject: net/mlx5: E-Switch, Introduce FDB hardware capabilities Define needed hardware structures and capabilities needed for E-Switch FDB flow tables and read them on driver load. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 15 +++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 13 +++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 90a4cb6dc4cb..bce9caed1eed 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1138,6 +1138,7 @@ enum mlx5_cap_type { MLX5_CAP_IPOIB_OFFLOADS, MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, + MLX5_CAP_ESWITCH_FLOW_TABLE, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1175,6 +1176,20 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \ MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap) +#define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, cap) \ + MLX5_GET(flow_table_eswitch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, flow_table_properties_nic_esw_fdb.cap) + +#define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 39487d0c305d..ae7c08adba4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -447,6 +447,18 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 reserved_3[0x7200]; }; +struct mlx5_ifc_flow_table_eswitch_cap_bits { + u8 reserved_0[0x200]; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress; + + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; + + u8 reserved_1[0x7800]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1846,6 +1858,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_roce_cap_bits roce_cap; struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; + struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; u8 reserved_0[0x8000]; }; -- cgit v1.2.3 From 81848731ff4070a3e4136efa6a99d507177a53fe Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:20 +0200 Subject: net/mlx5: E-Switch, Add SR-IOV (FDB) support Enabling E-Switch SRIOV for nvfs+1 vports. Create E-Switch FDB for L2 UC/MC mac steering between VFs/PF and external vport (Uplink). FDB contains forwarding rules such as: UC MAC0 -> vport0(PF). UC MAC1 -> vport1. UC MAC2 -> vport2. MC MACX -> vport0, vport2, Uplink. MC MACY -> vport1, Uplink. For unmatched traffic FDB has the following default rules: Unmached Traffic (src vport != Uplink) -> Uplink. Unmached Traffic (src vport == Uplink) -> vport0(PF). FDB rules population: Each NIC vport (VF) will notify E-Switch manager of its UC/MC vport context changes via modify vport context command, which will be translated to an event that will be handled by E-Switch manager (PF) which will update FDB table accordingly. Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/flow_table.h | 9 +++++++++ include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bce9caed1eed..88eb4490a8b3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1074,6 +1074,12 @@ enum { VPORT_STATE_UP = 0x1, }; +enum { + MLX5_ESW_VPORT_ADMIN_STATE_DOWN = 0x0, + MLX5_ESW_VPORT_ADMIN_STATE_UP = 0x1, + MLX5_ESW_VPORT_ADMIN_STATE_AUTO = 0x2, +}; + enum { MLX5_L3_PROT_TYPE_IPV4 = 0, MLX5_L3_PROT_TYPE_IPV6 = 1, diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h index 5f922c6d4fc2..0f2a15cf3317 100644 --- a/include/linux/mlx5/flow_table.h +++ b/include/linux/mlx5/flow_table.h @@ -41,6 +41,15 @@ struct mlx5_flow_table_group { u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; }; +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + void *ft; + u32 vport_num; + }; +}; + void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, u16 num_groups, struct mlx5_flow_table_group *group); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ae7c08adba4a..a81b008738fd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -827,9 +827,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_69[0x220]; }; -enum { - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_ = 0x1, - MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR = 0x2, +enum mlx5_flow_destination_type { + MLX5_FLOW_DESTINATION_TYPE_VPORT = 0x0, + MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, + MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, }; struct mlx5_ifc_dest_format_struct_bits { -- cgit v1.2.3 From d6666753c6e85834f1669c7b831cc2b7fc9e4390 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 1 Dec 2015 18:03:22 +0200 Subject: net/mlx5: E-Switch, Introduce HCA cap and E-Switch vport context E-Switch vport context is unlike NIC vport context, managed by the E-Switch manager or vport_group_manager and not by the NIC(VF) driver. The E-Switch manager can access (read/modify) any of its vports E-Switch context. Currently E-Switch vport context includes only clietnt and server vlan insertion and striping data (for later support of VST mode). Signed-off-by: Saeed Mahameed Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 9 +++++ include/linux/mlx5/mlx5_ifc.h | 90 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 88eb4490a8b3..7d3a85faefb7 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1145,6 +1145,7 @@ enum mlx5_cap_type { MLX5_CAP_EOIB_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, + MLX5_CAP_ESWITCH, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1196,6 +1197,14 @@ enum mlx5_cap_type { #define MLX5_CAP_ESW_FLOWTABLE_FDB_MAX(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_nic_esw_fdb.cap) +#define MLX5_CAP_ESW(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_cur[MLX5_CAP_ESWITCH], cap) + +#define MLX5_CAP_ESW_MAX(mdev, cap) \ + MLX5_GET(e_switch_cap, \ + mdev->hca_caps_max[MLX5_CAP_ESWITCH], cap) + #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a81b008738fd..f5d94495758a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -459,6 +459,17 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_1[0x7800]; }; +struct mlx5_ifc_e_switch_cap_bits { + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert_if_not_exist[0x1]; + u8 vport_cvlan_insert_overwrite[0x1]; + u8 reserved_0[0x1b]; + + u8 reserved_1[0x7e0]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -1860,6 +1871,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_e_switch_cap_bits e_switch_cap; u8 reserved_0[0x8000]; }; @@ -2305,6 +2317,26 @@ struct mlx5_ifc_hca_vport_context_bits { u8 reserved_6[0xca0]; }; +struct mlx5_ifc_esw_vport_context_bits { + u8 reserved_0[0x3]; + u8 vport_svlan_strip[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_insert[0x2]; + u8 reserved_1[0x18]; + + u8 reserved_2[0x20]; + + u8 svlan_cfi[0x1]; + u8 svlan_pcp[0x3]; + u8 svlan_id[0xc]; + u8 cvlan_cfi[0x1]; + u8 cvlan_pcp[0x3]; + u8 cvlan_id[0xc]; + + u8 reserved_3[0x7a0]; +}; + enum { MLX5_EQC_STATUS_OK = 0x0, MLX5_EQC_STATUS_EQ_WRITE_FAILURE = 0xa, @@ -3743,6 +3775,64 @@ struct mlx5_ifc_query_flow_group_in_bits { u8 reserved_5[0x120]; }; +struct mlx5_ifc_query_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + +struct mlx5_ifc_query_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + u8 reserved_3[0x20]; +}; + +struct mlx5_ifc_modify_esw_vport_context_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_esw_vport_context_fields_select_bits { + u8 reserved[0x1c]; + u8 vport_cvlan_insert[0x1]; + u8 vport_svlan_insert[0x1]; + u8 vport_cvlan_strip[0x1]; + u8 vport_svlan_strip[0x1]; +}; + +struct mlx5_ifc_modify_esw_vport_context_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_2[0xf]; + u8 vport_number[0x10]; + + struct mlx5_ifc_esw_vport_context_fields_select_bits field_select; + + struct mlx5_ifc_esw_vport_context_bits esw_vport_context; +}; + struct mlx5_ifc_query_eq_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; -- cgit v1.2.3 From 2d1e0254ef8310e4f0756130a7ffc007ad1d58df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 1 Dec 2015 14:55:21 +0000 Subject: pci_ids: add Netronome Systems vendor Add PCI vendor id for Netronome Systems. Signed-off-by: Jakub Kicinski Signed-off-by: Rolf Neugebauer Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d9ba49cedc5d..1acbefc4bbda 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2495,6 +2495,8 @@ #define PCI_DEVICE_ID_KORENIX_JETCARDF2 0x1700 #define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff +#define PCI_VENDOR_ID_NETRONOME 0x19ee + #define PCI_VENDOR_ID_QMI 0x1a32 #define PCI_VENDOR_ID_AZWAVE 0x1a3b -- cgit v1.2.3 From 80a19e338d458abb5a700df3fd00795c51361f06 Mon Sep 17 00:00:00 2001 From: Asias He Date: Wed, 2 Dec 2015 14:44:00 +0800 Subject: VSOCK: Introduce virtio-vsock-common.ko This module contains the common code and header files for the following virtio-vsock and virtio-vhost kernel modules. Signed-off-by: Asias He Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 include/linux/virtio_vsock.h (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h new file mode 100644 index 000000000000..a5f3ecc038f7 --- /dev/null +++ b/include/linux/virtio_vsock.h @@ -0,0 +1,209 @@ +/* + * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) Red Hat, Inc., 2013-2015 + * Copyright (C) Asias He , 2013 + * Copyright (C) Stefan Hajnoczi , 2015 + */ + +#ifndef _LINUX_VIRTIO_VSOCK_H +#define _LINUX_VIRTIO_VSOCK_H + +#include +#include +#include + +#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 +#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) +#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) + +struct vsock_transport_recv_notify_data; +struct vsock_transport_send_notify_data; +struct sockaddr_vm; +struct vsock_sock; + +enum { + VSOCK_VQ_CTRL = 0, + VSOCK_VQ_RX = 1, /* for host to guest data */ + VSOCK_VQ_TX = 2, /* for guest to host data */ + VSOCK_VQ_MAX = 3, +}; + +/* virtio transport socket state */ +struct virtio_transport { + struct virtio_transport_pkt_ops *ops; + struct vsock_sock *vsk; + + u32 buf_size; + u32 buf_size_min; + u32 buf_size_max; + + struct mutex tx_lock; + struct mutex rx_lock; + + struct list_head rx_queue; + u32 rx_bytes; + + /* Protected by trans->tx_lock */ + u32 tx_cnt; + u32 buf_alloc; + u32 peer_fwd_cnt; + u32 peer_buf_alloc; + /* Protected by trans->rx_lock */ + u32 fwd_cnt; + + /* Protected by sk_lock */ + u16 dgram_id; + struct list_head incomplete_dgrams; /* dgram fragments */ +}; + +struct virtio_vsock_pkt { + struct virtio_vsock_hdr hdr; + struct virtio_transport *trans; + struct work_struct work; + struct list_head list; + void *buf; + u32 len; + u32 off; +}; + +struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct msghdr *msg; + u32 pkt_len; + u16 type; + u16 op; + u32 flags; + u16 dgram_id; + u16 dgram_len; +}; + +struct virtio_transport_pkt_ops { + int (*send_pkt)(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info); +}; + +void virtio_vsock_dumppkt(const char *func, + const struct virtio_vsock_pkt *pkt); + +struct sock * +virtio_transport_get_pending(struct sock *listener, + struct virtio_vsock_pkt *pkt); +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port); +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, + int type); +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk); +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); +void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now); +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_available_now); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data); +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data); +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); +bool virtio_transport_stream_is_active(struct vsock_sock *vsk); +bool virtio_transport_stream_allow(u32 cid, u32 port); +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr); +bool virtio_transport_dgram_allow(u32 cid, u32 port); + +int virtio_transport_connect(struct vsock_sock *vsk); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); + +void virtio_transport_release(struct vsock_sock *vsk); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len); +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t len); + +void virtio_transport_destruct(struct vsock_sock *vsk); + +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); +u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); +void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); +#endif /* _LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 2f8364a291e8adde25c93f97a76abbcaf4b1ed3f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 3 Dec 2015 21:12:31 +0100 Subject: WAN: HDLC: Call notifiers before and after changing device type An HDLC device can change type when the protocol driver is changed. Calling the notifier change allows potential users of the interface know about this planned change, and even block it. After the change has occurred, send a second notification to users can evaluate the new device type etc. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/hdlc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h index 1acb1445e05f..e31bcd4c7859 100644 --- a/include/linux/hdlc.h +++ b/include/linux/hdlc.h @@ -101,7 +101,7 @@ netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size); /* May be used by hardware driver to gain control over HDLC device */ -void detach_hdlc_protocol(struct net_device *dev); +int detach_hdlc_protocol(struct net_device *dev); static __inline__ __be16 hdlc_type_trans(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Dec 2015 15:01:31 +0100 Subject: net: constify netif_is_* helpers net_device param As suggested by Eric, these helpers should have const dev param. Suggested-by: Eric Dumazet Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdevice.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 67ce5bd3b56a..05f5879821b8 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -73,7 +73,7 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); -static inline bool is_vlan_dev(struct net_device *dev) +static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3efe017fe419..1bb21ff0fa64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3661,7 +3661,7 @@ extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, - bool (*type_check)(struct net_device *dev)); + bool (*type_check)(const struct net_device *dev)); int skb_checksum_help(struct sk_buff *skb); struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); @@ -3858,32 +3858,32 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } -static inline bool netif_is_macvlan(struct net_device *dev) +static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; } -static inline bool netif_is_macvlan_port(struct net_device *dev) +static inline bool netif_is_macvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(struct net_device *dev) +static inline bool netif_is_ipvlan(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_SLAVE; } -static inline bool netif_is_ipvlan_port(struct net_device *dev) +static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return dev->priv_flags & IFF_IPVLAN_MASTER; } -static inline bool netif_is_bond_master(struct net_device *dev) +static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; } -static inline bool netif_is_bond_slave(struct net_device *dev) +static inline bool netif_is_bond_slave(const struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; } @@ -3918,22 +3918,22 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } -static inline bool netif_is_team_master(struct net_device *dev) +static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; } -static inline bool netif_is_team_port(struct net_device *dev) +static inline bool netif_is_team_port(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM_PORT; } -static inline bool netif_is_lag_master(struct net_device *dev) +static inline bool netif_is_lag_master(const struct net_device *dev) { return netif_is_bond_master(dev) || netif_is_team_master(dev); } -static inline bool netif_is_lag_port(struct net_device *dev) +static inline bool netif_is_lag_port(const struct net_device *dev) { return netif_is_bond_slave(dev) || netif_is_team_port(dev); } -- cgit v1.2.3 From 5f61385d2ebc2bd62bc389c7da0d8d2f263be1eb Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 6 Dec 2015 18:07:41 +0200 Subject: net/mlx4_core: Keep VLAN/MAC tables mirrored in multifunc HA mode Due to HW limitations, indexes to MAC and VLAN tables are always taken from the table of the actual port. So, if a resource holds an index to a table, it may refer to different values during the lifetime of the resource, unless the tables are mirrored. Also, even when driver is not in HA mode the policy of allocating an index to these tables is such to make sure, as much as possible, that when the time comes the mirroring will be successful. This means that in multifunction mode the allocation of a free index in a port's table tries to make sure that the same index in the other's port table is also free. Signed-off-by: Moni Shoua Reviewed-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 5a06d969338e..2e8af001c5da 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -75,6 +75,11 @@ static inline int mlx4_is_bonded(struct mlx4_dev *dev) return !!(dev->flags & MLX4_FLAG_BONDED); } +static inline int mlx4_is_mf_bonded(struct mlx4_dev *dev) +{ + return (mlx4_is_bonded(dev) && mlx4_is_mfunc(dev)); +} + struct mlx4_port_map { u8 port1; u8 port2; -- cgit v1.2.3 From ea3793ee29d3621faf857fa8ef5425e9ff9a756d Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Sun, 6 Dec 2015 21:11:34 +0000 Subject: core: enable more fine-grained datagram reception control The __skb_recv_datagram routine in core/ datagram.c provides a general skb reception factility supposed to be utilized by protocol modules providing datagram sockets. It encompasses both the actual recvmsg code and a surrounding 'sleep until data is available' loop. This is inconvenient if a protocol module has to use additional locking in order to maintain some per-socket state the generic datagram socket code is unaware of (as the af_unix code does). The patch below moves the recvmsg proper code into a new __skb_try_recv_datagram routine which doesn't sleep and renames wait_for_more_packets to __skb_wait_for_more_packets, both routines being exported interfaces. The original __skb_recv_datagram routine is reimplemented on top of these two functions such that its user-visible behaviour remains unchanged. Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c9c394bf0771..9b9b9ead7bb3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2785,6 +2785,12 @@ static inline void skb_frag_list_init(struct sk_buff *skb) #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) + +int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p, + const struct sk_buff *skb); +struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *off, int *err, + struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, int *peeked, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, -- cgit v1.2.3 From 8ac2837c89c8c0fcad557e4380aeef80580390f9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Dec 2015 10:51:12 +0800 Subject: Revert "Merge branch 'vsock-virtio'" This reverts commit 0d76d6e8b2507983a2cae4c09880798079007421 and merge commit c402293bd76fbc93e52ef8c0947ab81eea3ae019, reversing changes made to c89359a42e2a49656451569c382eed63e781153c. The virtio-vsock device specification is not finalized yet. Michael Tsirkin voiced concerned about merging this code when the hardware interface (and possibly the userspace interface) could still change. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 209 ------------------------------------------- 1 file changed, 209 deletions(-) delete mode 100644 include/linux/virtio_vsock.h (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h deleted file mode 100644 index a5f3ecc038f7..000000000000 --- a/include/linux/virtio_vsock.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2013-2015 - * Copyright (C) Asias He , 2013 - * Copyright (C) Stefan Hajnoczi , 2015 - */ - -#ifndef _LINUX_VIRTIO_VSOCK_H -#define _LINUX_VIRTIO_VSOCK_H - -#include -#include -#include - -#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE 128 -#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE (1024 * 256) -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) -#define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL -#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) -#define VIRTIO_VSOCK_MAX_TX_BUF_SIZE (1024 * 1024 * 16) -#define VIRTIO_VSOCK_MAX_DGRAM_SIZE (1024 * 64) - -struct vsock_transport_recv_notify_data; -struct vsock_transport_send_notify_data; -struct sockaddr_vm; -struct vsock_sock; - -enum { - VSOCK_VQ_CTRL = 0, - VSOCK_VQ_RX = 1, /* for host to guest data */ - VSOCK_VQ_TX = 2, /* for guest to host data */ - VSOCK_VQ_MAX = 3, -}; - -/* virtio transport socket state */ -struct virtio_transport { - struct virtio_transport_pkt_ops *ops; - struct vsock_sock *vsk; - - u32 buf_size; - u32 buf_size_min; - u32 buf_size_max; - - struct mutex tx_lock; - struct mutex rx_lock; - - struct list_head rx_queue; - u32 rx_bytes; - - /* Protected by trans->tx_lock */ - u32 tx_cnt; - u32 buf_alloc; - u32 peer_fwd_cnt; - u32 peer_buf_alloc; - /* Protected by trans->rx_lock */ - u32 fwd_cnt; - - /* Protected by sk_lock */ - u16 dgram_id; - struct list_head incomplete_dgrams; /* dgram fragments */ -}; - -struct virtio_vsock_pkt { - struct virtio_vsock_hdr hdr; - struct virtio_transport *trans; - struct work_struct work; - struct list_head list; - void *buf; - u32 len; - u32 off; -}; - -struct virtio_vsock_pkt_info { - u32 remote_cid, remote_port; - struct msghdr *msg; - u32 pkt_len; - u16 type; - u16 op; - u32 flags; - u16 dgram_id; - u16 dgram_len; -}; - -struct virtio_transport_pkt_ops { - int (*send_pkt)(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info); -}; - -void virtio_vsock_dumppkt(const char *func, - const struct virtio_vsock_pkt *pkt); - -struct sock * -virtio_transport_get_pending(struct sock *listener, - struct virtio_vsock_pkt *pkt); -struct virtio_vsock_pkt * -virtio_transport_alloc_pkt(struct vsock_sock *vsk, - struct virtio_vsock_pkt_info *info, - size_t len, - u32 src_cid, - u32 src_port, - u32 dst_cid, - u32 dst_port); -ssize_t -virtio_transport_stream_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, - int type); -int -virtio_transport_dgram_dequeue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len, int flags); - -s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); -s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); - -int virtio_transport_do_socket_init(struct vsock_sock *vsk, - struct vsock_sock *psk); -u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk); -u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk); -void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val); -void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val); -int -virtio_transport_notify_poll_in(struct vsock_sock *vsk, - size_t target, - bool *data_ready_now); -int -virtio_transport_notify_poll_out(struct vsock_sock *vsk, - size_t target, - bool *space_available_now); - -int virtio_transport_notify_recv_init(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, - size_t target, struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, - size_t target, ssize_t copied, bool data_read, - struct vsock_transport_recv_notify_data *data); -int virtio_transport_notify_send_init(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, - struct vsock_transport_send_notify_data *data); -int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, - ssize_t written, struct vsock_transport_send_notify_data *data); - -u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk); -bool virtio_transport_stream_is_active(struct vsock_sock *vsk); -bool virtio_transport_stream_allow(u32 cid, u32 port); -int virtio_transport_dgram_bind(struct vsock_sock *vsk, - struct sockaddr_vm *addr); -bool virtio_transport_dgram_allow(u32 cid, u32 port); - -int virtio_transport_connect(struct vsock_sock *vsk); - -int virtio_transport_shutdown(struct vsock_sock *vsk, int mode); - -void virtio_transport_release(struct vsock_sock *vsk); - -ssize_t -virtio_transport_stream_enqueue(struct vsock_sock *vsk, - struct msghdr *msg, - size_t len); -int -virtio_transport_dgram_enqueue(struct vsock_sock *vsk, - struct sockaddr_vm *remote_addr, - struct msghdr *msg, - size_t len); - -void virtio_transport_destruct(struct vsock_sock *vsk); - -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_inc_tx_pkt(struct virtio_vsock_pkt *pkt); -void virtio_transport_dec_tx_pkt(struct virtio_vsock_pkt *pkt); -u32 virtio_transport_get_credit(struct virtio_transport *trans, u32 wanted); -void virtio_transport_put_credit(struct virtio_transport *trans, u32 credit); -#endif /* _LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:52 -0500 Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data. ->sk_cgroup_prioidx and ->sk_classid are moved into it. The struct and its accessors are defined in cgroup-defs.h. This is to prepare for overloading the fields with a cgroup pointer. This patch mostly performs equivalent conversions but the followings are noteworthy. * Equality test before updating classid is removed from sock_update_classid(). This shouldn't make any noticeable difference and a similar test will be implemented on the helper side later. * sock_update_netprioidx() now takes struct sock_cgroup_data and can be moved to netprio_cgroup.h without causing include dependency loop. Moved. * The dummy version of sock_update_netprioidx() converted to a static inline function while at it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 504d8591b6d3..ed128fed0335 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -542,4 +542,40 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +struct sock_cgroup_data { + u16 prioidx; + u32 classid; +}; + +static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) +{ + return skcd->prioidx; +} + +static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) +{ + return skcd->classid; +} + +static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, + u16 prioidx) +{ + skcd->prioidx = prioidx; +} + +static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, + u32 classid) +{ + skcd->classid = classid; +} + +#else /* CONFIG_SOCK_CGROUP_DATA */ + +struct sock_cgroup_data { +}; + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_DEFS_H */ -- cgit v1.2.3 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in anyway and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 88 +++++++++++++++++++++++++++++++++++++++++---- include/linux/cgroup.h | 41 +++++++++++++++++++++ 2 files changed, 123 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ed128fed0335..9dc226345e4e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -544,31 +544,107 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #ifdef CONFIG_SOCK_CGROUP_DATA +/* + * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains + * per-socket cgroup information except for memcg association. + * + * On legacy hierarchies, net_prio and net_cls controllers directly set + * attributes on each sock which can then be tested by the network layer. + * On the default hierarchy, each sock is associated with the cgroup it was + * created in and the networking layer can match the cgroup directly. + * + * To avoid carrying all three cgroup related fields separately in sock, + * sock_cgroup_data overloads (prioidx, classid) and the cgroup pointer. + * On boot, sock_cgroup_data records the cgroup that the sock was created + * in so that cgroup2 matches can be made; however, once either net_prio or + * net_cls starts being used, the area is overriden to carry prioidx and/or + * classid. The two modes are distinguished by whether the lowest bit is + * set. Clear bit indicates cgroup pointer while set bit prioidx and + * classid. + * + * While userland may start using net_prio or net_cls at any time, once + * either is used, cgroup2 matching no longer works. There is no reason to + * mix the two and this is in line with how legacy and v2 compatibility is + * handled. On mode switch, cgroup references which are already being + * pointed to by socks may be leaked. While this can be remedied by adding + * synchronization around sock_cgroup_data, given that the number of leaked + * cgroups is bound and highly unlikely to be high, this seems to be the + * better trade-off. + */ struct sock_cgroup_data { - u16 prioidx; - u32 classid; + union { +#ifdef __LITTLE_ENDIAN + struct { + u8 is_data; + u8 padding; + u16 prioidx; + u32 classid; + } __packed; +#else + struct { + u32 classid; + u16 prioidx; + u8 padding; + u8 is_data; + } __packed; +#endif + u64 val; + }; }; +/* + * There's a theoretical window where the following accessors race with + * updaters and return part of the previous pointer as the prioidx or + * classid. Such races are short-lived and the result isn't critical. + */ static inline u16 sock_cgroup_prioidx(struct sock_cgroup_data *skcd) { - return skcd->prioidx; + /* fallback to 1 which is always the ID of the root cgroup */ + return (skcd->is_data & 1) ? skcd->prioidx : 1; } static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) { - return skcd->classid; + /* fallback to 0 which is the unconfigured default classid */ + return (skcd->is_data & 1) ? skcd->classid : 0; } +/* + * If invoked concurrently, the updaters may clobber each other. The + * caller is responsible for synchronization. + */ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - skcd->prioidx = prioidx; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_prioidx(&skcd_buf) == prioidx) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.prioidx = prioidx; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - skcd->classid = classid; + struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + + if (sock_cgroup_classid(&skcd_buf) == classid) + return; + + if (!(skcd_buf.is_data & 1)) { + skcd_buf.val = 0; + skcd_buf.is_data = 1; + } + + skcd_buf.classid = classid; + WRITE_ONCE(skcd->val, skcd_buf.val); /* see sock_cgroup_ptr() */ } #else /* CONFIG_SOCK_CGROUP_DATA */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4c3ffab81ba7..a8ba1ea0ea5a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,4 +578,45 @@ static inline int cgroup_init(void) { return 0; } #endif /* !CONFIG_CGROUPS */ +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) +extern spinlock_t cgroup_sk_update_lock; +#endif + +void cgroup_sk_alloc_disable(void); +void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_free(struct sock_cgroup_data *skcd); + +static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) +{ +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + unsigned long v; + + /* + * @skcd->val is 64bit but the following is safe on 32bit too as we + * just need the lower ulong to be written and read atomically. + */ + v = READ_ONCE(skcd->val); + + if (v & 1) + return &cgrp_dfl_root.cgrp; + + return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; +#else + return (struct cgroup *)(unsigned long)skcd->val; +#endif +} + +#else /* CONFIG_CGROUP_DATA */ + +static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} + +#endif /* CONFIG_CGROUP_DATA */ + #endif /* _LINUX_CGROUP_H */ -- cgit v1.2.3 From ad2c8c73d29702c3193f739390f6661f9a4ecad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 9 Dec 2015 12:30:46 -0500 Subject: cgroup: fix sock_cgroup_data initialization on earlier compilers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sock_cgroup_data is a struct containing an anonymous union. sock_cgroup_set_prioidx() and sock_cgroup_set_classid() were initializing a field inside the anonymous union as follows. struct sock_ccgroup_data skcd_buf = { .val = VAL }; While this is fine on more recent compilers, gcc-4.4.7 triggers the following errors. include/linux/cgroup-defs.h: In function ‘sock_cgroup_set_prioidx’: include/linux/cgroup-defs.h:619: error: unknown field ‘val’ specified in initializer include/linux/cgroup-defs.h:619: warning: missing braces around initializer include/linux/cgroup-defs.h:619: warning: (near initialization for ‘skcd_buf.’) This is because .val belongs to the anonymous union nested inside the struct but the initializer is missing the nesting. Fix it by adding an extra pair of braces. Signed-off-by: Tejun Heo Reported-by: Alaa Hleihel Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9dc226345e4e..097901a68671 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -616,7 +616,7 @@ static inline u32 sock_cgroup_classid(struct sock_cgroup_data *skcd) static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_prioidx(&skcd_buf) == prioidx) return; @@ -633,7 +633,7 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { - struct sock_cgroup_data skcd_buf = { .val = READ_ONCE(skcd->val) }; + struct sock_cgroup_data skcd_buf = {{ .val = READ_ONCE(skcd->val) }}; if (sock_cgroup_classid(&skcd_buf) == classid) return; -- cgit v1.2.3 From 899077791403ff7a2d8cfaa87bd1a82d729463e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 8 Dec 2015 16:32:27 +0100 Subject: netcp: try to reduce type confusion in descriptors The netcp driver produces tons of warnings when CONFIG_LPAE is enabled on ARM: drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_tx_map_skb': drivers/net/ethernet/ti/netcp_core.c:1084:13: warning: passing argument 1 of 'set_words' from incompatible pointer type [-Wincompatible-pointer-types] This is the result of trying to pass a pointer to a dma_addr_t to a function that expects a u32 pointer to copy that into a DMA descriptor. Looking at that code in more detail to fix the warnings, I see multiple related problems: * The conversion functions are not endian-safe, as the DMA descriptors are almost certainly fixed-endian, but the CPU is not. * On 64-bit machines, passing a pointer through a u32 variable is a bug, accessing an indirect pointer as a u32 pointer even more so. * The handling of epib and psdata mixes native-endian and device-endian data. In this patch, I try to sort out the types for most accesses here, adding le32_to_cpu/cpu_to_le32 where appropriate, and passing pointers through two 32-bit words in the descriptor padding, to make it plausible that the driver does the right thing if compiled for big-endian or 64-bit systems. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/soc/ti/knav_dma.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index dad035c16d94..343c13ac4f71 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -144,17 +144,17 @@ struct knav_dma_cfg { * @psdata: Protocol specific */ struct knav_dma_desc { - u32 desc_info; - u32 tag_info; - u32 packet_info; - u32 buff_len; - u32 buff; - u32 next_desc; - u32 orig_len; - u32 orig_buff; - u32 epib[KNAV_DMA_NUM_EPIB_WORDS]; - u32 psdata[KNAV_DMA_NUM_PS_WORDS]; - u32 pad[4]; + __le32 desc_info; + __le32 tag_info; + __le32 packet_info; + __le32 buff_len; + __le32 buff; + __le32 next_desc; + __le32 orig_len; + __le32 orig_buff; + __le32 epib[KNAV_DMA_NUM_EPIB_WORDS]; + __le32 psdata[KNAV_DMA_NUM_PS_WORDS]; + __le32 pad[4]; } ____cacheline_aligned; #if IS_ENABLED(CONFIG_KEYSTONE_NAVIGATOR_DMA) -- cgit v1.2.3 From 26a8145390b36cbe97a5bd0b9e97249f21af6aea Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:39 +0200 Subject: net/mlx5_core: Introduce flow steering firmware commands Introduce new Flow Steering (FS) firmware commands, in-order to support the new flow steering infrastructure. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 47 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 32 ++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 include/linux/mlx5/fs.h (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h new file mode 100644 index 000000000000..34fd8dc0b3e1 --- /dev/null +++ b/include/linux/mlx5/fs.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_ +#define _MLX5_FS_ + +#include + +struct mlx5_flow_table; + +struct mlx5_flow_destination { + enum mlx5_flow_destination_type type; + union { + u32 tir_num; + struct mlx5_flow_table *ft; + }; +}; +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f5d94495758a..131a2737cfa3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -256,25 +256,27 @@ struct mlx5_ifc_flow_table_fields_supported_bits { struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; - u8 reserved_0[0x1f]; + u8 reserved_0[0x2]; + u8 flow_modify_en[0x1]; + u8 reserved_1[0x1c]; - u8 reserved_1[0x2]; + u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_2[0x10]; + u8 reserved_3[0x10]; u8 max_ft_level[0x8]; - u8 reserved_3[0x20]; + u8 reserved_4[0x20]; - u8 reserved_4[0x18]; + u8 reserved_5[0x18]; u8 log_max_ft_num[0x8]; - u8 reserved_5[0x18]; + u8 reserved_6[0x18]; u8 log_max_destination[0x8]; - u8 reserved_6[0x18]; + u8 reserved_7[0x18]; u8 log_max_flow[0x8]; - u8 reserved_7[0x40]; + u8 reserved_8[0x40]; struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; @@ -2843,6 +2845,13 @@ struct mlx5_ifc_set_hca_cap_in_bits { union mlx5_ifc_hca_cap_union_bits capability; }; +enum { + MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION = 0x0, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_TAG = 0x1, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST = 0x2, + MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS = 0x3 +}; + struct mlx5_ifc_set_fte_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; @@ -2867,11 +2876,14 @@ struct mlx5_ifc_set_fte_in_bits { u8 reserved_4[0x8]; u8 table_id[0x18]; - u8 reserved_5[0x40]; + u8 reserved_5[0x18]; + u8 modify_enable_mask[0x8]; + + u8 reserved_6[0x20]; u8 flow_index[0x20]; - u8 reserved_6[0xe0]; + u8 reserved_7[0xe0]; struct mlx5_ifc_flow_context_bits flow_context; }; -- cgit v1.2.3 From 2530236303d9e705db6a28eb9a10c8d79b288b37 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:43 +0200 Subject: net/mlx5_core: Flow steering tree initialization Flow steering initialization is based on static tree which illustrates the flow steering tree when the driver is loaded. The initialization considers the max supported flow table level of the device, a minimum of 2 kernel flow tables(vlan and mac) are required to have kernel flow table functionality. The tree structures when the driver is loaded: root_namespace(receive nic) | priority-0 (kernel priority) | namespace(kernel namespace) | priority-0 (flow tables priority) In the following patches, When the EN driver will use the flow steering API, it create two flow tables and their flow groups under priority-0(flow tables priority). Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/fs.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ac098b6b97bf..2fd7019f69db 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -502,6 +502,8 @@ struct mlx5_priv { struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; unsigned long pci_dev_data; + struct mlx5_flow_root_namespace *root_ns; + struct mlx5_flow_root_namespace *fdb_root_ns; }; enum mlx5_device_state { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 34fd8dc0b3e1..16ae5233dc7b 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -35,6 +35,13 @@ #include +#define MLX5_FS_DEFAULT_FLOW_TAG 0x0 + +enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_FDB, +}; + struct mlx5_flow_table; struct mlx5_flow_destination { @@ -42,6 +49,7 @@ struct mlx5_flow_destination { union { u32 tir_num; struct mlx5_flow_table *ft; + u32 vport_num; }; }; #endif -- cgit v1.2.3 From 86d722ad2c3bd2f0536b196b7fd67ae2a7e2a492 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 10 Dec 2015 17:12:44 +0200 Subject: net/mlx5: Use flow steering infrastructure for mlx5_en Expose the new flow steering API and remove the old one. Few changes are required: 1. The Ethernet flow steering follows the existing implementation, but uses the new steering API. The old flow steering implementation is removed. 2. Move the E-switch FDB management to use the new API. 3. When driver is loaded call to mlx5_init_fs which initialize the flow steering tree structure, open namespaces for NIC receive and for E-switch FDB. 4. Call to mlx5_cleanup_fs when the driver is unloaded. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/flow_table.h | 63 ----------------------------------------- include/linux/mlx5/fs.h | 38 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 63 deletions(-) delete mode 100644 include/linux/mlx5/flow_table.h (limited to 'include/linux') diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h deleted file mode 100644 index 0f2a15cf3317..000000000000 --- a/include/linux/mlx5/flow_table.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MLX5_FLOW_TABLE_H -#define MLX5_FLOW_TABLE_H - -#include - -struct mlx5_flow_table_group { - u8 log_sz; - u8 match_criteria_enable; - u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; -}; - -struct mlx5_flow_destination { - enum mlx5_flow_destination_type type; - union { - u32 tir_num; - void *ft; - u32 vport_num; - }; -}; - -void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type, - u16 num_groups, - struct mlx5_flow_table_group *group); -void mlx5_destroy_flow_table(void *flow_table); -int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable, - void *match_criteria, void *flow_context, - u32 *flow_index); -void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index); -u32 mlx5_get_flow_table_id(void *flow_table); - -#endif /* MLX5_FLOW_TABLE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 16ae5233dc7b..bc7ad019afde 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -33,6 +33,7 @@ #ifndef _MLX5_FS_ #define _MLX5_FS_ +#include #include #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 @@ -43,6 +44,9 @@ enum mlx5_flow_namespace_type { }; struct mlx5_flow_table; +struct mlx5_flow_group; +struct mlx5_flow_rule; +struct mlx5_flow_namespace; struct mlx5_flow_destination { enum mlx5_flow_destination_type type; @@ -52,4 +56,38 @@ struct mlx5_flow_destination { u32 vport_num; }; }; + +struct mlx5_flow_namespace * +mlx5_get_flow_namespace(struct mlx5_core_dev *dev, + enum mlx5_flow_namespace_type type); + +struct mlx5_flow_table * +mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries); +int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); + +/* inbox should be set with the following values: + * start_flow_index + * end_flow_index + * match_criteria_enable + * match_criteria + */ +struct mlx5_flow_group * +mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); +void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); + +/* Single destination per rule. + * Group ID is implied by the match criteria. + */ +struct mlx5_flow_rule * +mlx5_add_flow_rule(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + u32 *match_criteria, + u32 *match_value, + u32 action, + u32 flow_tag, + struct mlx5_flow_destination *dest); +void mlx5_del_flow_rule(struct mlx5_flow_rule *fr); + #endif -- cgit v1.2.3 From bda13fed677bdb423b97dcf054f68b9eb4c6dbfb Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 13 Dec 2015 16:53:02 +0900 Subject: net: Fix typo in skb_fclone_busy This patch fix a typo found within comment of skb_fclone_busy. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9b9b9ead7bb3..af4f6ac025b6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -833,7 +833,7 @@ struct sk_buff_fclones { * skb_fclone_busy - check if fclone is busy * @skb: buffer * - * Returns true is skb is a fast clone, and its clone is not freed. + * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that this didnt happen. */ -- cgit v1.2.3 From 55dc5a9f2f2afd32d7b1bda44a5fc95e67a3371f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:40 -0800 Subject: net: Add skb_inner_transport_offset function Same thing as skb_transport_offset but returns the offset of the inner transport header (when skb->encpasulation is set). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af4f6ac025b6..2393373c9d08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1939,6 +1939,11 @@ static inline unsigned char *skb_inner_transport_header(const struct sk_buff return skb->head + skb->inner_transport_header; } +static inline int skb_inner_transport_offset(const struct sk_buff *skb) +{ + return skb_inner_transport_header(skb) - skb->data; +} + static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { skb->inner_transport_header = skb->data - skb->head; -- cgit v1.2.3 From 53692b1de419c1b59106909c7f6b4dd3dbc768ac Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:41 -0800 Subject: sctp: Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC The SCTP checksum is really a CRC and is very different from the standards 1's complement checksum that serves as the checksum for IP protocols. This offload interface is also very different. Rename NETIF_F_SCTP_CSUM to NETIF_F_SCTP_CRC to highlight these differences. The term CSUM should be reserved in the stack to refer to the standard 1's complement IP checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index f0d87347df19..6395f8309393 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -52,7 +52,7 @@ enum { NETIF_F_GSO_TUNNEL_REMCSUM_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ - NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ + NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ NETIF_F_FCOE_MTU_BIT, /* Supports max FCoE MTU, 2158 bytes*/ NETIF_F_NTUPLE_BIT, /* N-tuple filters supported */ NETIF_F_RXHASH_BIT, /* Receive hashing offload */ @@ -103,7 +103,7 @@ enum { #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) #define NETIF_F_RXHASH __NETIF_F(RXHASH) -#define NETIF_F_SCTP_CSUM __NETIF_F(SCTP_CSUM) +#define NETIF_F_SCTP_CRC __NETIF_F(SCTP_CRC) #define NETIF_F_SG __NETIF_F(SG) #define NETIF_F_TSO6 __NETIF_F(TSO6) #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) -- cgit v1.2.3 From a188222b6ed29404ac2d4232d35d1fe0e77af370 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:43 -0800 Subject: net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK The name NETIF_F_ALL_CSUM is a misnomer. This does not correspond to the set of features for offloading all checksums. This is a mask of the checksum offload related features bits. It is incorrect to set both NETIF_F_HW_CSUM and NETIF_F_IP_CSUM or NETIF_F_IPV6 at the same time for features of a device. This patch: - Changes instances of NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK (where NETIF_F_ALL_CSUM is being used as a mask). - Changes bonding, sfc/efx, ipvlan, macvlan, vlan, and team drivers to use NEITF_F_HW_CSUM in features list instead of NETIF_F_ALL_CSUM. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 7 ++++++- include/linux/netdevice.h | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 6395f8309393..2c4e94ab88da 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -149,7 +149,12 @@ enum { #define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM #define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) #define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) -#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) + +/* List of IP checksum features. Note that NETIF_HW_CSUM should not be + * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- + * this would be contradictory + */ +#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1bb21ff0fa64..a54223a113b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3763,12 +3763,12 @@ static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); f1 &= f2; if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); return f1; } -- cgit v1.2.3 From c8cd0989bd151fda87bbf10887b3df18021284bc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:44 -0800 Subject: net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM These netif flags are unnecessary convolutions. It is more straightforward to just use NETIF_F_HW_CSUM, NETIF_F_IP_CSUM, and NETIF_F_IPV6_CSUM directly. This patch also: - Cleans up can_checksum_protocol - Simplifies netdev_intersect_features Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 2 +- include/linux/netdev_features.h | 9 +++------ include/linux/netdevice.h | 40 +++++++++++++++++++++++++--------------- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 05f5879821b8..a5f6ce6b578c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -621,7 +621,7 @@ static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | - NETIF_F_GEN_CSUM | + NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2c4e94ab88da..d9654f0eecb3 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -146,15 +146,12 @@ enum { #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ NETIF_F_TSO6 | NETIF_F_UFO) -#define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM -#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) -#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM) - -/* List of IP checksum features. Note that NETIF_HW_CSUM should not be +/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ -#define NETIF_F_CSUM_MASK (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) +#define NETIF_F_CSUM_MASK (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ + NETIF_F_HW_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a54223a113b1..283984b67cd9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3691,13 +3691,24 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth); static inline bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); + if (protocol == htons(ETH_P_FCOE)) + return !!(features & NETIF_F_FCOE_CRC); + + /* Assume this is an IP checksum (not SCTP CRC) */ + + if (features & NETIF_F_HW_CSUM) { + /* Can checksum everything */ + return true; + } + + switch (protocol) { + case htons(ETH_P_IP): + return !!(features & NETIF_F_IP_CSUM); + case htons(ETH_P_IPV6): + return !!(features & NETIF_F_IPV6_CSUM); + default: + return false; + } } #ifdef CONFIG_BUG @@ -3762,15 +3773,14 @@ void linkwatch_run_queue(void); static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { - if (f1 & NETIF_F_GEN_CSUM) - f1 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - if (f2 & NETIF_F_GEN_CSUM) - f2 |= (NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); - f1 &= f2; - if (f1 & NETIF_F_GEN_CSUM) - f1 &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM); + if ((f1 ^ f2) & NETIF_F_HW_CSUM) { + if (f1 & NETIF_F_HW_CSUM) + f1 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + else + f2 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + } - return f1; + return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( -- cgit v1.2.3 From 6ae23ad36253a8033c5714c52b691b84456487c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:46 -0800 Subject: net: Add driver helper functions to determine checksum offloadability Add skb_csum_offload_chk driver helper function to determine if a device with limited checksum offload capabilities is able to offload the checksum for a given packet. This patch includes: - The skb_csum_offload_chk function. Returns true if checksum is offloadable, else false. Optionally, in the case that the checksum is not offloable, the function can call skb_checksum_help to resolve the checksum. skb_csum_offload_chk also returns whether the checksum refers to an encapsulated checksum. - Definition of skb_csum_offl_spec structure that caller uses to indicate rules about what it can offload (e.g. IPv4/v6, TCP/UDP only, whether encapsulated checksums can be offloaded, whether checksum with IPv6 extension headers can be offloaded). - Ancilary functions called skb_csum_offload_chk_help, skb_csum_off_chk_help_cmn, skb_csum_off_chk_help_cmn_v4_only. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 283984b67cd9..9fb6395967de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2522,6 +2522,71 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, remcsum_unadjust((__sum16 *)ptr, grc->delta); } +struct skb_csum_offl_spec { + __u16 ipv4_okay:1, + ipv6_okay:1, + encap_okay:1, + ip_options_okay:1, + ext_hdrs_okay:1, + tcp_okay:1, + udp_okay:1, + sctp_okay:1, + vlan_okay:1, + no_encapped_ipv6:1, + no_not_encapped:1; +}; + +bool __skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help); + +static inline bool skb_csum_offload_chk(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec, + bool *csum_encapped, + bool csum_help) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) + return false; + + return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help); +} + +static inline bool skb_csum_offload_chk_help(struct sk_buff *skb, + const struct skb_csum_offl_spec *spec) +{ + bool csum_encapped; + + return skb_csum_offload_chk(skb, spec, &csum_encapped, true); +} + +static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .ipv6_okay = 1, + .vlan_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + +static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb) +{ + static const struct skb_csum_offl_spec csum_offl_spec = { + .ipv4_okay = 1, + .ip_options_okay = 1, + .tcp_okay = 1, + .udp_okay = 1, + .vlan_okay = 1, + }; + + return skb_csum_offload_chk_help(skb, &csum_offl_spec); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -3711,6 +3776,19 @@ static inline bool can_checksum_protocol(netdev_features_t features, } } +/* Map an ethertype into IP protocol if possible */ +static inline int eproto_to_ipproto(int eproto) +{ + switch (eproto) { + case htons(ETH_P_IP): + return IPPROTO_IP; + case htons(ETH_P_IPV6): + return IPPROTO_IPV6; + default: + return -1; + } +} + #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev); #else -- cgit v1.2.3 From 7a6ae71b2490586ed55105893a18dfc648e5fcbb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 14 Dec 2015 11:19:47 -0800 Subject: net: Elaborate on checksum offload interface description Add specifics and details the description of the interface between the stack and drivers for doing checksum offload. This description is meant to be as specific and complete as possible. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 138 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2393373c9d08..6b6bd42d6134 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -39,11 +39,55 @@ #include #include -/* A. Checksumming of received packets by device. +/* The interface for checksum offload between the stack and networking drivers + * is as follows... + * + * A. IP checksum related features + * + * Drivers advertise checksum offload capabilities in the features of a device. + * From the stack's point of view these are capabilities offered by the driver, + * a driver typically only advertises features that it is capable of offloading + * to its device. + * + * The checksum related features are: + * + * NETIF_F_HW_CSUM - The driver (or its device) is able to compute one + * IP (one's complement) checksum for any combination + * of protocols or protocol layering. The checksum is + * computed and set in a packet per the CHECKSUM_PARTIAL + * interface (see below). + * + * NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv4. These are specifically + * unencapsulated packets of the form IPv4|TCP or + * IPv4|UDP where the Protocol field in the IPv4 header + * is TCP or UDP. The IPv4 header may contain IP options + * This feature cannot be set in features for a device + * with NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain + * TCP or UDP packets over IPv6. These are specifically + * unencapsulated packets of the form IPv6|TCP or + * IPv4|UDP where the Next Header field in the IPv6 + * header is either TCP or UDP. IPv6 extension headers + * are not supported with this feature. This feature + * cannot be set in features for a device with + * NETIF_F_HW_CSUM also set. This feature is being + * DEPRECATED (see below). + * + * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. + * This flag is used only used to disable the RX checksum + * feature for a device. The stack will accept receive + * checksum indication in packets received on a device + * regardless of whether NETIF_F_RXCSUM is set. + * + * B. Checksumming of received packets by device. Indication of checksum + * verification is in set skb->ip_summed. Possible values are: * * CHECKSUM_NONE: * - * Device failed to checksum this packet e.g. due to lack of capabilities. + * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * @@ -53,9 +97,8 @@ * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY * if their checksums are okay. skb->csum is still undefined in this case - * though. It is a bad option, but, unfortunately, nowadays most vendors do - * this. Apparently with the secret goal to sell you new devices, when you - * will add new protocol to your host, f.e. IPv6 8) + * though. A driver or device must never modify the checksum field in the + * packet even if checksum is verified. * * CHECKSUM_UNNECESSARY is applicable to following protocols: * TCP: IPv6 and IPv4. @@ -96,40 +139,77 @@ * packet that are after the checksum being offloaded are not considered to * be verified. * - * B. Checksumming on output. - * - * CHECKSUM_NONE: - * - * The skb was already checksummed by the protocol, or a checksum is not - * required. + * C. Checksumming on transmit for non-GSO. The stack requests checksum offload + * in the skb->ip_summed for a packet. Values are: * * CHECKSUM_PARTIAL: * - * The device is required to checksum the packet as seen by hard_start_xmit() + * The driver is required to checksum the packet as seen by hard_start_xmit() * from skb->csum_start up to the end, and to record/write the checksum at - * offset skb->csum_start + skb->csum_offset. + * offset skb->csum_start + skb->csum_offset. A driver may verify that the + * csum_start and csum_offset values are valid values given the length and + * offset of the packet, however they should not attempt to validate that the + * checksum refers to a legitimate transport layer checksum-- it is the + * purview of the stack to validate that csum_start and csum_offset are set + * correctly. + * + * When the stack requests checksum offload for a packet, the driver MUST + * ensure that the checksum is set correctly. A driver can either offload the + * checksum calculation to the device, or call skb_checksum_help (in the case + * that the device does not support offload for a particular checksum). + * + * NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of + * NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate + * checksum offload capability. If a device has limited checksum capabilities + * (for instance can only perform NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM as + * described above) a helper function can be called to resolve + * CHECKSUM_PARTIAL. The helper functions are skb_csum_off_chk*. The helper + * function takes a spec argument that describes the protocol layer that is + * supported for checksum offload and can be called for each packet. If a + * packet does not match the specification for offload, skb_checksum_help + * is called to resolve the checksum. * - * The device must show its capabilities in dev->features, set up at device - * setup time, e.g. netdev_features.h: + * CHECKSUM_NONE: * - * NETIF_F_HW_CSUM - It's a clever device, it's able to checksum everything. - * NETIF_F_IP_CSUM - Device is dumb, it's able to checksum only TCP/UDP over - * IPv4. Sigh. Vendors like this way for an unknown reason. - * Though, see comment above about CHECKSUM_UNNECESSARY. 8) - * NETIF_F_IPV6_CSUM - About as dumb as the last one but does IPv6 instead. - * NETIF_F_... - Well, you get the picture. + * The skb was already checksummed by the protocol, or a checksum is not + * required. * * CHECKSUM_UNNECESSARY: * - * Normally, the device will do per protocol specific checksumming. Protocol - * implementations that do not want the NIC to perform the checksum - * calculation should use this flag in their outgoing skbs. + * This has the same meaning on as CHECKSUM_NONE for checksum offload on + * output. * - * NETIF_F_FCOE_CRC - This indicates that the device can do FCoE FC CRC - * offload. Correspondingly, the FCoE protocol driver - * stack should use CHECKSUM_UNNECESSARY. - * - * Any questions? No questions, good. --ANK + * CHECKSUM_COMPLETE: + * Not used in checksum output. If a driver observes a packet with this value + * set in skbuff, if should treat as CHECKSUM_NONE being set. + * + * D. Non-IP checksum (CRC) offloads + * + * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of + * offloading the SCTP CRC in a packet. To perform this offload the stack + * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note the there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports + * both IP checksum offload and SCTP CRC offload must verify which offload + * is configured for a packet presumably by inspecting packet headers. + * + * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of + * offloading the FCOE CRC in a packet. To perform this offload the stack + * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset + * accordingly. Note the there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports + * both IP checksum offload and FCOE CRC offload must verify which offload + * is configured for a packet presumably by inspecting packet headers. + * + * E. Checksumming on output with GSO. + * + * In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload + * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the + * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as + * part of the GSO operation is implied. If a checksum is being offloaded + * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset + * are set to refer to the outermost checksum being offload (two offloaded + * checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ -- cgit v1.2.3 From 3502cad73c4bbf8f6365d539e814159275252c59 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:36 -0800 Subject: rhashtable: add function to replace an element Add the rhashtable_replace_fast function. This replaces one object in the table with another atomically. The hashes of the new and old objects must be equal. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 843ceca9a21e..77deece15fb3 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -819,4 +819,86 @@ out: return err; } +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_head __rcu **pprev; + struct rhash_head *he; + spinlock_t *lock; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + lock = rht_bucket_lock(tbl, hash); + + spin_lock_bh(lock); + + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + rcu_assign_pointer(*pprev, obj_new); + err = 0; + break; + } + + spin_unlock_bh(lock); + + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); + + return err; +} + #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From fc9e50f5a5a4e1fa9ba2756f745a13e693cf6a06 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 15 Dec 2015 15:41:37 -0800 Subject: netlink: add a start callback for starting a netlink dump The start callback allows the caller to set up a context for the dump callbacks. Presumably, the context can then be destroyed in the done callback. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 639e9b8b0e4d..0b41959aab9f 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -131,6 +131,7 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; + int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); @@ -153,6 +154,7 @@ struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { + int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); void *data; -- cgit v1.2.3 From b613f56ec9baf30edf5d9d607b822532a273dad7 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:02 +0900 Subject: net: diag: split inet_diag_dump_one_icsk into two Currently, inet_diag_dump_one_icsk finds a socket and then dumps its information to userspace. Split it into a part that finds the socket and a part that dumps the information. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0e707f0c1a3e..e7032f041982 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -3,6 +3,7 @@ #include +struct net; struct sock; struct inet_hashinfo; struct nlattr; @@ -41,6 +42,10 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, const struct nlmsghdr *nlh, const struct inet_diag_req_v2 *req); +struct sock *inet_diag_find_one_icsk(struct net *net, + struct inet_hashinfo *hashinfo, + const struct inet_diag_req_v2 *req); + int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); extern int inet_diag_register(const struct inet_diag_handler *handler); -- cgit v1.2.3 From 64be0aed59ad519d6f2160868734f7e278290ac1 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:03 +0900 Subject: net: diag: Add the ability to destroy a socket. This patch adds a SOCK_DESTROY operation, a destroy function pointer to sock_diag_handler, and a diag_destroy function pointer. It does not include any implementation code. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index fddebc617469..4018b48f2b3b 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -15,6 +15,7 @@ struct sock_diag_handler { __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); int (*get_info)(struct sk_buff *skb, struct sock *sk); + int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh); }; int sock_diag_register(const struct sock_diag_handler *h); @@ -68,4 +69,5 @@ bool sock_diag_has_destroy_listeners(const struct sock *sk) } void sock_diag_broadcast_destroy(struct sock *sk); +int sock_diag_destroy(struct sock *sk, int err); #endif -- cgit v1.2.3 From 6eb5d2e08f071c05ecbe135369c9ad418826cab2 Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 16 Dec 2015 12:30:04 +0900 Subject: net: diag: Support SOCK_DESTROY for inet sockets. This passes the SOCK_DESTROY operation to the underlying protocol diag handler, or returns -EOPNOTSUPP if that handler does not define a destroy operation. Most of this patch is just renaming functions. This is not strictly necessary, but it would be fairly counterintuitive to have the code to destroy inet sockets be in a function whose name starts with inet_diag_get. Signed-off-by: Lorenzo Colitti Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index e7032f041982..7c27fa1030e8 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,6 +24,10 @@ struct inet_diag_handler { void (*idiag_get_info)(struct sock *sk, struct inet_diag_msg *r, void *info); + + int (*destroy)(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req); + __u16 idiag_type; __u16 idiag_info_size; }; -- cgit v1.2.3 From 541c9a84cd85203244307d9ebb821102eed82789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 9 Dec 2015 23:36:51 +0100 Subject: ssb: pick SoC invariants code from MIPS BCM47xx arch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is code in ssb fetching "invariants" that is basically a set of board specific data. Every host requires its own implementation of reading function. In ssb we have support for PCI, PCMCIA & SDIO. For some (historical?) reason code reading "invariants" for SoC was placed in arch code and provided by a callback. This is not needed nowadays, so lets move that into ssb. This way we keep all "invariants" functions in a single module making code cleaner. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/ssb/ssb.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index c3d1a525bacc..26a0b3c3ce5f 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -524,13 +524,9 @@ struct ssb_init_invariants { typedef int (*ssb_invariants_func_t)(struct ssb_bus *bus, struct ssb_init_invariants *iv); -/* Register a SSB system bus. get_invariants() is called after the - * basic system devices are initialized. - * The invariants are usually fetched from some NVRAM. - * Put the invariants into the struct pointed to by iv. */ -extern int ssb_bus_ssbbus_register(struct ssb_bus *bus, - unsigned long baseaddr, - ssb_invariants_func_t get_invariants); +/* Register SoC bus. */ +extern int ssb_bus_host_soc_register(struct ssb_bus *bus, + unsigned long baseaddr); #ifdef CONFIG_SSB_PCIHOST extern int ssb_bus_pcibus_register(struct ssb_bus *bus, struct pci_dev *host_pci); -- cgit v1.2.3 From a8170d2b9e8d38a1f3fa3b40b6f8cd34a87d5382 Mon Sep 17 00:00:00 2001 From: "Singhai, Anjali" Date: Mon, 14 Dec 2015 12:21:17 -0800 Subject: geneve: Add geneve udp port offload for ethernet devices Add ndo_ops to add/del UDP ports to a device that supports geneve offload. v2: Comment fix. Signed-off-by: Anjali Singhai Jain Signed-off-by: Kiran Patil Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9fb6395967de..81b26a543a3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1013,6 +1013,19 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * a new port starts listening. The operation is protected by the * vxlan_net->sock_lock. * + * void (*ndo_add_geneve_port)(struct net_device *dev, + * sa_family_t sa_family, __be16 port); + * Called by geneve to notify a driver about the UDP port and socket + * address family that geneve is listnening to. It is called only when + * a new port starts listening. The operation is protected by the + * geneve_net->sock_lock. + * + * void (*ndo_del_geneve_port)(struct net_device *dev, + * sa_family_t sa_family, __be16 port); + * Called by geneve to notify the driver about a UDP port and socket + * address family that geneve is not listening to anymore. The operation + * is protected by the geneve_net->sock_lock. + * * void (*ndo_del_vxlan_port)(struct net_device *dev, * sa_family_t sa_family, __be16 port); * Called by vxlan to notify the driver about a UDP port and socket @@ -1217,7 +1230,12 @@ struct net_device_ops { void (*ndo_del_vxlan_port)(struct net_device *dev, sa_family_t sa_family, __be16 port); - + void (*ndo_add_geneve_port)(struct net_device *dev, + sa_family_t sa_family, + __be16 port); + void (*ndo_del_geneve_port)(struct net_device *dev, + sa_family_t sa_family, + __be16 port); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, -- cgit v1.2.3 From 7b8002a1511fcbcb0596cac90d67ad5c8182d0aa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 18:41:56 +0100 Subject: netfilter: nfnetlink: pass down netns pointer to call() and call_rcu() Adapt callsites to avoid recurrent lookup of the netns pointer. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 5646b24bfc64..ceacbf5dcb73 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -8,12 +8,12 @@ #include struct nfnl_callback { - int (*call)(struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[]); - int (*call_rcu)(struct sock *nl, struct sk_buff *skb, + int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); + int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]); int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); -- cgit v1.2.3 From 5913beaf0d70f97135ed7191c028fd88b3848864 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 19:41:57 +0100 Subject: netfilter: nfnetlink: pass down netns pointer to commit() and abort() callbacks Adapt callsites to avoid recurrent lookup of the netns pointer. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ceacbf5dcb73..ba0d9789eb6e 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -26,8 +26,8 @@ struct nfnetlink_subsystem { __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ - int (*commit)(struct sk_buff *skb); - int (*abort)(struct sk_buff *skb); + int (*commit)(struct net *net, struct sk_buff *skb); + int (*abort)(struct net *net, struct sk_buff *skb); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); -- cgit v1.2.3 From 7be4fb643ef2d1058b897ba9dbe17bf5ced04391 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 23 Dec 2015 23:45:24 +0100 Subject: nfc: microread: Fix header comment microread platform_data header had an NXP header. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/microread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/microread.h b/include/linux/platform_data/microread.h index cfda59b226ee..ca13992089b8 100644 --- a/include/linux/platform_data/microread.h +++ b/include/linux/platform_data/microread.h @@ -1,5 +1,5 @@ /* - * Driver include for the PN544 NFC chip. + * Driver include for the Inside Secure microread NFC Chip. * * Copyright (C) 2011 Tieto Poland * Copyright (C) 2012 Intel Corporation. All rights reserved. -- cgit v1.2.3 From f3a4094558ddf8afa8bb58250d548e15e059c65a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 30 Dec 2015 16:28:25 +0100 Subject: ethtool: Add phy statistics Ethernet PHYs can maintain statistics, for example errors while idle and receive errors. Add an ethtool mechanism to retrieve these statistics, using the same model as MAC statistics. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 05fde31b6dc6..a89cb0eef911 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -589,6 +589,12 @@ struct phy_driver { int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); + /* Get statistics from the phy using ethtool */ + int (*get_sset_count)(struct phy_device *dev); + void (*get_strings)(struct phy_device *dev, u8 *data); + void (*get_stats)(struct phy_device *dev, + struct ethtool_stats *stats, u64 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) -- cgit v1.2.3 From 888cc8c20cf265fcd1302f6c5d6be07628ba66c7 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 28 Dec 2015 02:07:08 +0300 Subject: sh_eth: remove EDMAC_BIG_ENDIAN Commit 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") added support for the big-endian EDMAC descriptors. However, it was never used and never worked right until the recent driver fixes. I think we now can just remove this support, it was only burdening the driver from the start. It should be easy to do without disturbing the SH platform code, at least for now... Signed-off-by: Sergei Shtylyov Acked-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/sh_eth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index 8c9131db2b25..f2e27e078362 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -4,7 +4,7 @@ #include #include -enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; +enum {EDMAC_LITTLE_ENDIAN}; struct sh_eth_plat_data { int phy; -- cgit v1.2.3 From 538950a1b7527a0a52ccd9337e3fcd304f027f13 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 4 Jan 2016 17:41:47 -0500 Subject: soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF Expose socket options for setting a classic or extended BPF program for use when selecting sockets in an SO_REUSEPORT group. These options can be used on the first socket to belong to a group before bind or on any socket in the group after bind. This change includes refactoring of the existing sk_filter code to allow reuse of the existing BPF filter validation checks. Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4165e9ac9e36..294c3cdf07b3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -447,6 +447,8 @@ void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); +int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); -- cgit v1.2.3 From b0844444590e18704644f707ea88bff1b976b0e7 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 29 Dec 2015 14:58:30 +0200 Subject: net/mlx5_core: Introduce access function to read internal timer A preparation step which adds support for reading the hardware internal timer and the hardware timestamping from the CQE. In addition, advertize device_frequency_khz HCA capability. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 20 +++++++++++++++++--- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7d3a85faefb7..df2f79ef3cac 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -443,9 +443,12 @@ struct mlx5_init_seg { __be32 rsvd1[120]; __be32 initializing; struct health_buffer health; - __be32 rsvd2[884]; + __be32 rsvd2[880]; + __be32 internal_timer_h; + __be32 internal_timer_l; + __be32 rsrv3[2]; __be32 health_counter; - __be32 rsvd3[1019]; + __be32 rsvd4[1019]; __be64 ieee1588_clk; __be32 ieee1588_clk_type; __be32 clr_intx; @@ -601,7 +604,8 @@ struct mlx5_cqe64 { __be32 imm_inval_pkey; u8 rsvd40[4]; __be32 byte_cnt; - __be64 timestamp; + __be32 timestamp_h; + __be32 timestamp_l; __be32 sop_drop_qpn; __be16 wqe_counter; u8 signature; @@ -623,6 +627,16 @@ static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe) return !!(cqe->l4_hdr_type_etc & 0x1); } +static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) +{ + u32 hi, lo; + + hi = be32_to_cpu(cqe->timestamp_h); + lo = be32_to_cpu(cqe->timestamp_l); + + return (u64)lo | ((u64)hi << 32); +} + enum { CQE_L4_HDR_TYPE_NONE = 0x0, CQE_L4_HDR_TYPE_TCP_NO_ACK = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 131a2737cfa3..1780a85a8797 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -829,9 +829,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_67[0xe0]; - - u8 reserved_68[0x1f]; + u8 reserved_67[0x40]; + u8 device_frequency_khz[0x20]; + u8 reserved_68[0x5f]; u8 cqe_zip[0x1]; u8 cqe_zip_timeout[0x10]; -- cgit v1.2.3 From cdba756f5803a2f0a8bbc6605acc166dd817979e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Jan 2016 06:53:50 -0800 Subject: net: move ndo_features_check() close to ndo_start_xmit() TX fast path uses ndo_start_xmit(), ndo_features_check() and ndo_select_queue(). Move ndo_features_check() close to ndo_start_xmit() to increase data locality. All "struct net_device_ops" should now be using C99 initializers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c20b814e46a0..8d8e5ca951b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -812,6 +812,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX) * Required can not be NULL. * + * netdev_features_t (*ndo_fix_features)(struct net_device *dev, + * netdev_features_t features); + * Adjusts the requested feature flags according to device-specific + * constraints, and returns the resulting flags. Must not modify + * the device state. + * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple @@ -959,12 +965,6 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * Called to release previously enslaved netdev. * * Feature/offload setting functions. - * netdev_features_t (*ndo_fix_features)(struct net_device *dev, - * netdev_features_t features); - * Adjusts the requested feature flags according to device-specific - * constraints, and returns the resulting flags. Must not modify - * the device state. - * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). @@ -1081,8 +1081,11 @@ struct net_device_ops { void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); - netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb, - struct net_device *dev); + netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, + struct net_device *dev); + netdev_features_t (*ndo_features_check)(struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, void *accel_priv, @@ -1245,9 +1248,6 @@ struct net_device_ops { struct net_device *dev, void *priv); int (*ndo_get_lock_subclass)(struct net_device *dev); - netdev_features_t (*ndo_features_check) (struct sk_buff *skb, - struct net_device *dev, - netdev_features_t features); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); -- cgit v1.2.3 From c7f5d105495a38ed09e70d825f75d9d7d5407264 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Nov 2015 11:34:57 -0500 Subject: net: Add eth_platform_get_mac_address() helper. A repeating pattern in drivers has become to use OF node information and, if not found, platform specific host information to extract the ethernet address for a given device. Currently this is done with a call to of_get_mac_address() and then some ifdef'd stuff for SPARC. Consolidate this into a portable routine, and provide the arch_get_platform_mac_address() weak function hook for all architectures to implement if they want. Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index eb049c622208..37ff4a6faa9a 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,6 +29,9 @@ #include #ifdef __KERNEL__ +struct device; +int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); +unsigned char *arch_get_platform_get_mac_address(void); u32 eth_get_headlen(void *data, unsigned int max_len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; -- cgit v1.2.3 From ccaa953e9fc7ebb90fba4e4815966683bef4866f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:06 +0100 Subject: phy: Consistently use addr for address on an MII bus Within phy.h, an address on an MII bus has been called both addr and phy_id. phy_id is particularly confusion, since it also means the ID found in register 3, if the device on the bus is a phy. Consistently use addr. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index a89cb0eef911..77b5e56e2a92 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -158,8 +158,8 @@ struct mii_bus { const char *name; char id[MII_BUS_ID_SIZE]; void *priv; - int (*read)(struct mii_bus *bus, int phy_id, int regnum); - int (*write)(struct mii_bus *bus, int phy_id, int regnum, u16 val); + int (*read)(struct mii_bus *bus, int addr, int regnum); + int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); int (*reset)(struct mii_bus *bus); /* -- cgit v1.2.3 From bac83c653799d7ea3f6cc4d7396d75adc5e0f778 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:07 +0100 Subject: mdio: Move mdiobus_read/write operatings into mdio.h These are logically MDIO operations, not phy operations, so move them into the mdio header. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 6 ++++++ include/linux/phy.h | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index b42963bc81dd..0d073c23c10d 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -11,6 +11,7 @@ #include +struct mii_bus; static inline bool mdio_phy_id_is_c45(int phy_id) { @@ -173,4 +174,9 @@ static inline u16 ethtool_adv_to_mmd_eee_adv_t(u32 adv) return reg; } +int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); +int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); +int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 77b5e56e2a92..8ca161a37e8a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -212,11 +213,6 @@ static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) void devm_mdiobus_free(struct device *dev, struct mii_bus *bus); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); -int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); -int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); -int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); -int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); - #define PHY_INTERRUPT_DISABLED 0x0 #define PHY_INTERRUPT_ENABLED 0x80000000 -- cgit v1.2.3 From 72ba48be3ec8e70937ad97d4420ef7144617c64b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:09 +0100 Subject: phy: Add phydev_err() and phydev_dbg() macros In preparation for moving some of the phy_device structure members, add macros for printing errors and debug information. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 8ca161a37e8a..dbcf9fdd960c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -777,6 +777,12 @@ static inline int phy_read_status(struct phy_device *phydev) return phydev->drv->read_status(phydev); } +#define phydev_err(_phydev, format, args...) \ + dev_err(&_phydev->dev, format, ##args) + +#define phydev_dbg(_phydev, format, args...) \ + dev_dbg(&_phydev->dev, format, ##args) + int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From 84eff6d194df442bee62c129f2f47efb0dbd0468 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:10 +0100 Subject: phy: add phydev_name() wrapper Add a phydev_name() function, to help with moving some structure members from phy_device. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index dbcf9fdd960c..5f5cc3424b9e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -783,6 +783,11 @@ static inline int phy_read_status(struct phy_device *phydev) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->dev, format, ##args) +static inline const char *phydev_name(const struct phy_device *phydev) +{ + return dev_name(&phydev->dev); +} + int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From 053e7e169229adebbc27fc176c5369398e9f5eba Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:12 +0100 Subject: phy: phy_{read|write}_mmd_indirect: get addr from phydev The address of the device can be determined from the phydev structure, rather than passing it as a parameter. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5f5cc3424b9e..08198ce98773 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -629,14 +629,12 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) * phy_read_mmd_indirect - reads data from the MMD registers * @phydev: The PHY device bus * @prtad: MMD Address - * @devad: MMD DEVAD * @addr: PHY address on the MII bus * * Description: it reads data from the MMD registers (clause 22 to access to * clause 45) of the specified phy address. */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, int addr); +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); /** * phy_read - Convenience function for reading a given PHY register @@ -735,14 +733,13 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, * @phydev: The PHY device * @prtad: MMD Address * @devad: MMD DEVAD - * @addr: PHY address on the MII bus * @data: data to write in the MMD register * * Description: Write data from the MMD registers of the specified * phy address. */ void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, int addr, u32 data); + int devad, u32 data); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, -- cgit v1.2.3 From 2220943a21e26d97d7fd8f83c004b947326b469d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:13 +0100 Subject: phy: Centralise print about attached phy Many Ethernet drivers contain the same netdev_info() print statement about the attached phy. Move it into the phy device code. Additionally add a varargs function which can be used to append additional information. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 08198ce98773..ecbf6382ba29 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -16,6 +16,7 @@ #ifndef __PHY_H #define __PHY_H +#include #include #include #include @@ -785,6 +786,9 @@ static inline const char *phydev_name(const struct phy_device *phydev) return dev_name(&phydev->dev); } +void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) + __printf(2, 3); +void phy_attached_info(struct phy_device *phydev); int genphy_config_init(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); -- cgit v1.2.3 From e7f4dc3536a40097f95103ddf98dd55b3a980f5b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:15 +0100 Subject: mdio: Move allocation of interrupts into core Have mdio_alloc() create the array of interrupt numbers, and initialize it to POLLING. This is what most MDIO drivers want, so allowing code to be removed from the drivers. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ecbf6382ba29..a5473c9e19de 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -189,10 +189,10 @@ struct mii_bus { u32 phy_ignore_ta_mask; /* - * Pointer to an array of interrupts, each PHY's - * interrupt at the index matching its address + * An array of interrupts, each PHY's interrupt at the index + * matching its address */ - int *irq; + int irq[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From e5a03bfd873c29eb786655ef2e95e53ed242b404 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:16 +0100 Subject: phy: Add an mdio_device structure Not all devices attached to an MDIO bus are phys. So add an mdio_device structure to represent the generic parts of an mdio device, and place this structure into the phy_device. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 9 +++++++++ include/linux/phy.h | 26 +++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 0d073c23c10d..94f9f1491cde 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -13,6 +13,15 @@ struct mii_bus; +struct mdio_device { + struct device dev; + + struct mii_bus *bus; + /* Bus address of the MDIO device (0-31) */ + int addr; +}; +#define to_mdio_device(d) container_of(d, struct mdio_device, dev) + static inline bool mdio_phy_id_is_c45(int phy_id) { return (phy_id & MDIO_PHY_ID_C45) && !(phy_id & ~MDIO_PHY_ID_C45_MASK); diff --git a/include/linux/phy.h b/include/linux/phy.h index a5473c9e19de..239a0c2bc49d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -358,14 +358,12 @@ struct phy_c45_device_ids { * handling, as well as handling shifts in PHY hardware state */ struct phy_device { + struct mdio_device mdio; + /* Information about the PHY type */ /* And management functions */ struct phy_driver *drv; - struct mii_bus *bus; - - struct device dev; - u32 phy_id; struct phy_c45_device_ids c45_ids; @@ -381,9 +379,6 @@ struct phy_device { phy_interface_t interface; - /* Bus address of the PHY (0-31) */ - int addr; - /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) @@ -432,7 +427,8 @@ struct phy_device { void (*adjust_link)(struct net_device *dev); }; -#define to_phy_device(d) container_of(d, struct phy_device, dev) +#define to_phy_device(d) container_of(to_mdio_device(d), \ + struct phy_device, mdio) /* struct phy_driver: Driver structure for a particular PHY type * @@ -622,7 +618,7 @@ static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) if (!phydev->is_c45) return -EOPNOTSUPP; - return mdiobus_read(phydev->bus, phydev->addr, + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); } @@ -648,7 +644,7 @@ int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { - return mdiobus_read(phydev->bus, phydev->addr, regnum); + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** @@ -663,7 +659,7 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum) */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { - return mdiobus_write(phydev->bus, phydev->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** @@ -726,7 +722,7 @@ static inline int phy_write_mmd(struct phy_device *phydev, int devad, regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); - return mdiobus_write(phydev->bus, phydev->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** @@ -776,14 +772,14 @@ static inline int phy_read_status(struct phy_device *phydev) } #define phydev_err(_phydev, format, args...) \ - dev_err(&_phydev->dev, format, ##args) + dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ - dev_dbg(&_phydev->dev, format, ##args) + dev_dbg(&_phydev->mdio.dev, format, ##args); static inline const char *phydev_name(const struct phy_device *phydev) { - return dev_name(&phydev->dev); + return dev_name(&phydev->mdio.dev); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) -- cgit v1.2.3 From 7f854420fbfe9d49afe2ffb1df052cfe8e215541 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:18 +0100 Subject: phy: Add API for {un}registering an mdio device to a bus. Rather than have drivers directly manipulate the mii_bus structure, provide and API for registering and unregistering devices on an MDIO bus, and performing lookups. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 8 ++++++++ include/linux/phy.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 94f9f1491cde..8cd9579e18ea 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -19,9 +19,12 @@ struct mdio_device { struct mii_bus *bus; /* Bus address of the MDIO device (0-31) */ int addr; + int flags; }; #define to_mdio_device(d) container_of(d, struct mdio_device, dev) +#define MDIO_DEVICE_FLAG_PHY 1 + static inline bool mdio_phy_id_is_c45(int phy_id) { return (phy_id & MDIO_PHY_ID_C45) && !(phy_id & ~MDIO_PHY_ID_C45_MASK); @@ -188,4 +191,9 @@ int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_register_device(struct mdio_device *mdiodev); +int mdiobus_unregister_device(struct mdio_device *mdiodev); +bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); +struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 239a0c2bc49d..2d7beef20825 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -180,7 +180,7 @@ struct mii_bus { struct device dev; /* list of all PHYs on bus */ - struct phy_device *phy_map[PHY_MAX_ADDR]; + struct mdio_device *mdio_map[PHY_MAX_ADDR]; /* PHY addresses to be ignored when probing */ u32 phy_mask; -- cgit v1.2.3 From bc87922ff59d364a33e9bce0febdef21a7fbd2af Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:21 +0100 Subject: phy: Move PHY PM operations into phy_device The MDIO PM operations are really PHY device PM operations. So move them into phy_device. This will be needed when we support devices on the mdio bus which are not PHYs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 8cd9579e18ea..9f844d372ed5 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -15,7 +15,7 @@ struct mii_bus; struct mdio_device { struct device dev; - + const struct dev_pm_ops *pm_ops; struct mii_bus *bus; /* Bus address of the MDIO device (0-31) */ int addr; -- cgit v1.2.3 From be01da72b1b832b89fbdf59ae6f1b60e53ca2987 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:22 +0100 Subject: phy: Centralize setting driver module owner Rather than have each driver set the driver owner field, do it once in the core code. This will also help with later changes, when the device structure will move. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d7beef20825..49e4418822b3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -797,8 +797,9 @@ int genphy_resume(struct phy_device *phydev); int genphy_soft_reset(struct phy_device *phydev); void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); -int phy_driver_register(struct phy_driver *new_driver); -int phy_drivers_register(struct phy_driver *new_driver, int n); +int phy_driver_register(struct phy_driver *new_driver, struct module *owner); +int phy_drivers_register(struct phy_driver *new_driver, int n, + struct module *owner); void phy_state_machine(struct work_struct *work); void phy_change(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev, int new_link); @@ -843,7 +844,7 @@ extern struct bus_type mdio_bus_type; #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ - return phy_drivers_register(__phy_drivers, __count); \ + return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ -- cgit v1.2.3 From e76a4957c5ee5cf69cea89d450c29c536e77ce9e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:23 +0100 Subject: phy: Move phy specific bus match into phy_device Matching a driver to a device has both generic parts, and parts which are specific to PHY devices. Move the PHY specific parts into phy_device. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 9f844d372ed5..0690359e55a5 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -17,6 +17,7 @@ struct mdio_device { struct device dev; const struct dev_pm_ops *pm_ops; struct mii_bus *bus; + int (*bus_match)(struct device *dev, struct device_driver *drv); /* Bus address of the MDIO device (0-31) */ int addr; int flags; -- cgit v1.2.3 From a9049e0c513c4521dbfaa302af8ed08b3366b41f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:26 +0100 Subject: mdio: Add support for mdio drivers. Not all devices on an MDIO bus are PHYs. Meaning not all MDIO drivers are PHY drivers. Add support for generic MDIO drivers. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 9 +++++---- 2 files changed, 55 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 0690359e55a5..75f7fad0af4f 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -15,6 +15,7 @@ struct mii_bus; struct mdio_device { struct device dev; + const struct dev_pm_ops *pm_ops; struct mii_bus *bus; int (*bus_match)(struct device *dev, struct device_driver *drv); @@ -24,7 +25,37 @@ struct mdio_device { }; #define to_mdio_device(d) container_of(d, struct mdio_device, dev) +/* struct mdio_driver_common: Common to all MDIO drivers */ +struct mdio_driver_common { + struct device_driver driver; + int flags; +}; #define MDIO_DEVICE_FLAG_PHY 1 +#define to_mdio_common_driver(d) \ + container_of(d, struct mdio_driver_common, driver) + +/* struct mdio_driver: Generic MDIO driver */ +struct mdio_driver { + struct mdio_driver_common mdiodrv; + + /* + * Called during discovery. Used to set + * up device-specific structures, if any + */ + int (*probe)(struct mdio_device *mdiodev); + + /* Clears up any memory if needed */ + void (*remove)(struct mdio_device *mdiodev); +}; +#define to_mdio_driver(d) \ + container_of(to_mdio_common_driver(d), struct mdio_driver, mdiodrv) + +void mdio_device_free(struct mdio_device *mdiodev); +struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr); +int mdio_device_register(struct mdio_device *mdiodev); +void mdio_device_remove(struct mdio_device *mdiodev); +int mdio_driver_register(struct mdio_driver *drv); +void mdio_driver_unregister(struct mdio_driver *drv); static inline bool mdio_phy_id_is_c45(int phy_id) { @@ -197,4 +228,23 @@ int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); +/** + * module_mdio_driver() - Helper macro for registering mdio drivers + * + * Helper macro for MDIO drivers which do not do anything special in module + * init/exit. Each module may only use this macro once, and calling it + * replaces module_init() and module_exit(). + */ +#define mdio_module_driver(_mdio_driver) \ +static int __init mdio_module_init(void) \ +{ \ + return mdio_driver_register(&_mdio_driver); \ +} \ +module_init(mdio_module_init); \ +static void __exit mdio_module_exit(void) \ +{ \ + mdio_driver_unregister(&_mdio_driver); \ +} \ +module_exit(mdio_module_exit) + #endif /* __LINUX_MDIO_H__ */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 49e4418822b3..d6f3641e7933 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -60,6 +60,7 @@ #define PHY_HAS_INTERRUPT 0x00000001 #define PHY_HAS_MAGICANEG 0x00000002 #define PHY_IS_INTERNAL 0x00000004 +#define MDIO_DEVICE_IS_PHY 0x80000000 /* Interface Mode definitions */ typedef enum { @@ -432,6 +433,7 @@ struct phy_device { /* struct phy_driver: Driver structure for a particular PHY type * + * driver_data: static driver data * phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field @@ -441,7 +443,6 @@ struct phy_device { * by this PHY * flags: A bitfield defining certain other features this PHY * supports (like interrupts) - * driver_data: static driver data * * The drivers must implement config_aneg and read_status. All * other functions are optional. Note that none of these @@ -452,6 +453,7 @@ struct phy_device { * supported in the driver). */ struct phy_driver { + struct mdio_driver_common mdiodrv; u32 phy_id; char *name; unsigned int phy_id_mask; @@ -587,10 +589,9 @@ struct phy_driver { void (*get_strings)(struct phy_device *dev, u8 *data); void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); - - struct device_driver driver; }; -#define to_phy_driver(d) container_of(d, struct phy_driver, driver) +#define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ + struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" #define PHY_ANY_UID 0xffffffff -- cgit v1.2.3 From 711fdba37a3dd7ee487e28767f9f0e67144cbf80 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 6 Jan 2016 20:11:27 +0100 Subject: mdio: Abstract device_remove() and device_free() Make device_free and device_remove operations in the mdio device structure, so the core code does not need to differentiate between phy devices and generic mdio devices. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mdio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 75f7fad0af4f..5bfd99d1a40a 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -18,7 +18,11 @@ struct mdio_device { const struct dev_pm_ops *pm_ops; struct mii_bus *bus; + int (*bus_match)(struct device *dev, struct device_driver *drv); + void (*device_free)(struct mdio_device *mdiodev); + void (*device_remove)(struct mdio_device *mdiodev); + /* Bus address of the MDIO device (0-31) */ int addr; int flags; -- cgit v1.2.3 From 01dd194c387af5b3c4c1f6459d30f596565e466c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 Jan 2016 22:32:16 +0100 Subject: bpf: cleanup bpf_prog_run_{save,clear}_cb helpers Move the details behind the cb[] access into a small helper to decouple and make them generic for bpf_prog_run_save_cb()/bpf_prog_run_clear_cb() that was introduced via commit ff936a04e5f2 ("bpf: fix cb access in socket filter programs"). Also add a comment to better clarify what is done in bpf_skb_cb(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5b5891ed1ba..43aa1f8855c7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -350,25 +350,43 @@ struct sk_filter { #define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN + +static inline u8 *bpf_skb_cb(struct sk_buff *skb) +{ + /* eBPF programs may read/write skb->cb[] area to transfer meta + * data between tail calls. Since this also needs to work with + * tc, that scratch memory is mapped to qdisc_skb_cb's data area. + * + * In some socket filter cases, the cb unfortunately needs to be + * saved/restored so that protocol specific skb->cb[] data won't + * be lost. In any case, due to unpriviledged eBPF programs + * attached to sockets, we need to clear the bpf_skb_cb() area + * to not leak previous contents to user space. + */ + BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != BPF_SKB_CB_LEN); + BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != + FIELD_SIZEOF(struct qdisc_skb_cb, data)); + + return qdisc_skb_cb(skb)->data; +} + static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { - u8 *cb_data = qdisc_skb_cb(skb)->data; - u8 saved_cb[QDISC_CB_PRIV_LEN]; + u8 *cb_data = bpf_skb_cb(skb); + u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; - BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != - QDISC_CB_PRIV_LEN); - if (unlikely(prog->cb_access)) { - memcpy(saved_cb, cb_data, sizeof(saved_cb)); - memset(cb_data, 0, sizeof(saved_cb)); + memcpy(cb_saved, cb_data, sizeof(cb_saved)); + memset(cb_data, 0, sizeof(cb_saved)); } res = BPF_PROG_RUN(prog, skb); if (unlikely(prog->cb_access)) - memcpy(cb_data, saved_cb, sizeof(saved_cb)); + memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } @@ -376,10 +394,11 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { - u8 *cb_data = qdisc_skb_cb(skb)->data; + u8 *cb_data = bpf_skb_cb(skb); if (unlikely(prog->cb_access)) - memset(cb_data, 0, QDISC_CB_PRIV_LEN); + memset(cb_data, 0, BPF_SKB_CB_LEN); + return BPF_PROG_RUN(prog, skb); } -- cgit v1.2.3 From f8ffad69c9f8b8dfb0b633425d4ef4d2493ba61a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 15:50:23 +0100 Subject: bpf: add skb_postpush_rcsum and fix dev_forward_skb occasions Add a small helper skb_postpush_rcsum() and fix up redirect locations that need CHECKSUM_COMPLETE fixups on ingress. dev_forward_skb() expects a proper csum that covers also Ethernet header, f.e. since 2c26d34bbcc0 ("net/core: Handle csum for CHECKSUM_COMPLETE VXLAN forwarding"), we also do skb_postpull_rcsum() after pulling Ethernet header off via eth_type_trans(). When using eBPF in a netns setup f.e. with vxlan in collect metadata mode, I can trigger the following csum issue with an IPv6 setup: [ 505.144065] dummy1: hw csum failure [...] [ 505.144108] Call Trace: [ 505.144112] [] dump_stack+0x44/0x5c [ 505.144134] [] netdev_rx_csum_fault+0x3a/0x40 [ 505.144142] [] __skb_checksum_complete+0xcf/0xe0 [ 505.144149] [] nf_ip6_checksum+0xb2/0x120 [ 505.144161] [] icmpv6_error+0x17e/0x328 [nf_conntrack_ipv6] [ 505.144170] [] ? ip6t_do_table+0x2fa/0x645 [ip6_tables] [ 505.144177] [] ? ipv6_get_l4proto+0x65/0xd0 [nf_conntrack_ipv6] [ 505.144189] [] nf_conntrack_in+0xc2/0x5a0 [nf_conntrack] [ 505.144196] [] ipv6_conntrack_in+0x1c/0x20 [nf_conntrack_ipv6] [ 505.144204] [] nf_iterate+0x5d/0x70 [ 505.144210] [] nf_hook_slow+0x66/0xc0 [ 505.144218] [] ipv6_rcv+0x3f2/0x4f0 [ 505.144225] [] ? ip6_make_skb+0x1b0/0x1b0 [ 505.144232] [] __netif_receive_skb_core+0x36b/0x9a0 [ 505.144239] [] ? __netif_receive_skb+0x18/0x60 [ 505.144245] [] __netif_receive_skb+0x18/0x60 [ 505.144252] [] process_backlog+0x9f/0x140 [ 505.144259] [] net_rx_action+0x145/0x320 [...] What happens is that on ingress, we push Ethernet header back in, either from cls_bpf or right before skb_do_redirect(), but without updating csum. The "hw csum failure" can be fixed by using the new skb_postpush_rcsum() helper for the dev_forward_skb() case to correct the csum diff again. Thanks to Hannes Frederic Sowa for the csum_partial() idea! Fixes: 3896d655f4d4 ("bpf: introduce bpf_clone_redirect() helper") Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b6bd42d6134..07f9ccd28654 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2805,6 +2805,23 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); +static inline void skb_postpush_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + /* For performing the reverse operation to skb_postpull_rcsum(), + * we can instead of ... + * + * skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); + * + * ... just use this equivalent version here to save a few + * instructions. Feeding csum of 0 in csum_partial() and later + * on adding skb->csum is equivalent to feed skb->csum in the + * first place. + */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_partial(start, len, skb->csum); +} + /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim -- cgit v1.2.3 From 1f211a1b929c804100e138c5d3d656992cfd5622 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jan 2016 22:29:47 +0100 Subject: net, sched: add clsact qdisc This work adds a generalization of the ingress qdisc as a qdisc holding only classifiers. The clsact qdisc works on ingress, but also on egress. In both cases, it's execution happens without taking the qdisc lock, and the main difference for the egress part compared to prior version of [1] is that this can be applied with _any_ underlying real egress qdisc (also classless ones). Besides solving the use-case of [1], that is, allowing for more programmability on assigning skb->priority for the mqprio case that is supported by most popular 10G+ NICs, it also opens up a lot more flexibility for other tc applications. The main work on classification can already be done at clsact egress time if the use-case allows and state stored for later retrieval f.e. again in skb->priority with major/minors (which is checked by most classful qdiscs before consulting tc_classify()) and/or in other skb fields like skb->tc_index for some light-weight post-processing to get to the eventual classid in case of a classful qdisc. Another use case is that the clsact egress part allows to have a central egress counterpart to the ingress classifiers, so that classifiers can easily share state (e.g. in cls_bpf via eBPF maps) for ingress and egress. Currently, default setups like mq + pfifo_fast would require for this to use, for example, prio qdisc instead (to get a tc_classify() run) and to duplicate the egress classifier for each queue. With clsact, it allows for leaving the setup as is, it can additionally assign skb->priority to put the skb in one of pfifo_fast's bands and it can share state with maps. Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid) w/o the need to perform a skb_dst_force() to hold on to it any longer. In lwt case, we can also use this facility to setup dst metadata via cls_bpf (bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for that (case of IFF_NO_QUEUE devices, for example). The realization can be done without any changes to the scheduler core framework. All it takes is that we have two a-priori defined minors/child classes, where we can mux between ingress and egress classifier list (dev->ingress_cl_list and dev->egress_cl_list, latter stored close to dev->_tx to avoid extra cacheline miss for moderate loads). The egress part is a bit similar modelled to handle_ing() and patched to a noop in case the functionality is not used. Both handlers are now called sch_handle_ingress() and sch_handle_egress(), code sharing among the two doesn't seem practical as there are various minor differences in both paths, so that making them conditional in a single handler would rather slow things down. Full compatibility to ingress qdisc is provided as well. Since both piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist per netdevice, and thus ingress qdisc specific behaviour can be retained for user space. This means, either a user does 'tc qdisc add dev foo ingress' and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact' alternative, where both, ingress and egress classifier can be configured as in the below example. ingress qdisc supports attaching classifier to any minor number whereas clsact has two fixed minors for muxing between the lists, therefore to not break user space setups, they are better done as two separate qdiscs. I decided to extend the sch_ingress module with clsact functionality so that commonly used code can be reused, the module is being aliased with sch_clsact so that it can be auto-loaded properly. Alternative would have been to add a flag when initializing ingress to alter its behaviour plus aliasing to a different name (as it's more than just ingress). However, the first would end up, based on the flag, choosing the new/old behaviour by calling different function implementations to handle each anyway, the latter would require to register ingress qdisc once again under different alias. So, this really begs to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops by its own that share callbacks used by both. Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will show an empty list for clsact. Either using the parent names (ingress/egress) or specifying the full major/minor will then show the related filter lists. Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend. [1] http://patchwork.ozlabs.org/patch/512949/ Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- include/linux/rtnetlink.h | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8d8e5ca951b4..2285596e7045 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1739,7 +1739,9 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif - +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *egress_cl_list; +#endif #ifdef CONFIG_NET_SWITCHDEV u32 offload_fwd_mark; #endif diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 4be5048b1fbe..c006cc900c44 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -84,6 +84,11 @@ void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif +#ifdef CONFIG_NET_EGRESS +void net_inc_egress_queue(void); +void net_dec_egress_queue(void); +#endif + extern void rtnetlink_init(void); extern void __rtnl_unlock(void); -- cgit v1.2.3 From 712f4aad406bb1ed67f3f98d04c044191f0ff593 Mon Sep 17 00:00:00 2001 From: willy tarreau Date: Sun, 10 Jan 2016 07:54:56 +0100 Subject: unix: properly account for FDs passed over unix sockets It is possible for a process to allocate and accumulate far more FDs than the process' limit by sending them over a unix socket then closing them to keep the process' fd count low. This change addresses this problem by keeping track of the number of FDs in flight per user and preventing non-privileged processes from having more FDs in flight than their configured FD limit. Reported-by: socketpair@gmail.com Reported-by: Tetsuo Handa Mitigates: CVE-2013-4312 (Linux 2.0+) Suggested-by: Linus Torvalds Acked-by: Hannes Frederic Sowa Signed-off-by: Willy Tarreau Signed-off-by: David S. Miller --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index edad7a43edea..fbf25f19b3b5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -830,6 +830,7 @@ struct user_struct { unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ + unsigned long unix_inflight; /* How many files in flight in unix sockets */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ -- cgit v1.2.3 From f0d22d1874730530a2ac304fd0888cb8a6864527 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:25:57 +0200 Subject: net/mlx5_core: Introduce flow steering autogrouped flow table When user add rule to autogrouped flow table, we search for flow group with the same match criteria, if we don't find such group then we create new flow group with the required match criteria and insert the rule to this group. We divide the flow table into required_groups + 1, in order to reserve a part of the flow table for rules which don't match any existing group. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index bc7ad019afde..06ac6e8fccfa 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -61,6 +61,12 @@ struct mlx5_flow_namespace * mlx5_get_flow_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type type); +struct mlx5_flow_table * +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + int max_num_groups); + struct mlx5_flow_table * mlx5_create_flow_table(struct mlx5_flow_namespace *ns, int prio, -- cgit v1.2.3 From 2cc43b494a6c30ec0e554ea91ce763c97069e8cc Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:25:59 +0200 Subject: net/mlx5_core: Managing root flow table The root Flow Table for each Flow Table Type is defined, by default, as the Flow Table with level 0. In order not to use an empty flow tables and introduce new hops, but still preserve space for flow-tables that have a priority greater(lower number) than the current flow table, we introduce this new set root flow table command. This command tells the HW to start matching packets from the assigned root flow table. This command is used when we create new flow table with level lower than the current lowest flow table or it is the first flow table. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1780a85a8797..323e713c44ba 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -185,6 +185,7 @@ enum { MLX5_CMD_OP_MODIFY_RQT = 0x917, MLX5_CMD_OP_DESTROY_RQT = 0x918, MLX5_CMD_OP_QUERY_RQT = 0x919, + MLX5_CMD_OP_SET_FLOW_TABLE_ROOT = 0x92f, MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931, MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, @@ -258,7 +259,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 ft_support[0x1]; u8 reserved_0[0x2]; u8 flow_modify_en[0x1]; - u8 reserved_1[0x1c]; + u8 modify_root[0x1]; + u8 reserved_1[0x1b]; u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; @@ -6946,4 +6948,31 @@ union mlx5_ifc_uplink_pci_interface_document_bits { u8 reserved_0[0x20060]; }; +struct mlx5_ifc_set_flow_table_root_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_set_flow_table_root_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x40]; + + u8 table_type[0x8]; + u8 reserved_3[0x18]; + + u8 reserved_4[0x8]; + u8 table_id[0x18]; + + u8 reserved_5[0x140]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 34a40e689393a6b13673ab395a9a4d063d249fe9 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:00 +0200 Subject: net/mlx5_core: Introduce modify flow table command Introduce the modify flow table command. This command is used when we want to change the next flow table of an existing flow table. The next flow table is defined as the table we search (in order to find a match), if we couldn't find a match in any of the flow table entries in the current flow table. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 56 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 323e713c44ba..7f166955d4c9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -194,7 +194,8 @@ enum { MLX5_CMD_OP_QUERY_FLOW_GROUP = 0x935, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY = 0x937, - MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938 + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938, + MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c }; struct mlx5_ifc_flow_table_fields_supported_bits { @@ -260,7 +261,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_0[0x2]; u8 flow_modify_en[0x1]; u8 modify_root[0x1]; - u8 reserved_1[0x1b]; + u8 identified_miss_table_mode[0x1]; + u8 flow_table_modify[0x1]; + u8 reserved_1[0x19]; u8 reserved_2[0x2]; u8 log_max_ft_size[0x6]; @@ -5669,12 +5672,16 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_4[0x20]; - u8 reserved_5[0x8]; + u8 reserved_5[0x4]; + u8 table_miss_mode[0x4]; u8 level[0x8]; u8 reserved_6[0x8]; u8 log_size[0x8]; - u8 reserved_7[0x120]; + u8 reserved_7[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_8[0x100]; }; struct mlx5_ifc_create_flow_group_out_bits { @@ -6975,4 +6982,45 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_5[0x140]; }; +enum { + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = 0x1, +}; + +struct mlx5_ifc_modify_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_0[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_1[0x40]; +}; + +struct mlx5_ifc_modify_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_0[0x10]; + + u8 reserved_1[0x10]; + u8 op_mod[0x10]; + + u8 reserved_2[0x20]; + + u8 reserved_3[0x10]; + u8 modify_field_select[0x10]; + + u8 table_type[0x8]; + u8 reserved_4[0x18]; + + u8 reserved_5[0x8]; + u8 table_id[0x18]; + + u8 reserved_6[0x4]; + u8 table_miss_mode[0x4]; + u8 reserved_7[0x18]; + + u8 reserved_8[0x8]; + u8 table_miss_id[0x18]; + + u8 reserved_9[0x100]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 4cbdd30ed5c8bc5cf40813b025b4fb57b376a592 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:04 +0200 Subject: net/mlx5_core: Enable flow steering support for the IB driver When the driver is loaded, we create flow steering namespace for kernel bypass with nine priorities and another namespace for leftovers(in order to catch packets that weren't matched). Verbs applications will use these priorities. we found nine as a number that balances the requirements from the user and retains performance. The bypass namespace is used by verbs applications that want to bypass the kernel networking stack. The leftovers namespace is used by verbs applications and the sniffer in order to catch packets that weren't handled by any preceding rules. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 2 ++ include/linux/mlx5/fs.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index df2f79ef3cac..7be845e30689 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1258,4 +1258,6 @@ static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz; } +#define MLX5_BY_PASS_NUM_PRIOS 9 + #endif /* MLX5_DEVICE_H */ diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 06ac6e8fccfa..a94341271e3f 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -39,7 +39,9 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 enum mlx5_flow_namespace_type { + MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, + MLX5_FLOW_NAMESPACE_LEFTOVERS, MLX5_FLOW_NAMESPACE_FDB, }; -- cgit v1.2.3 From b4d1f032d75b2efb73304e8c12faa7149ad700c7 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:05 +0200 Subject: net/mlx5_core: Make ipv4/ipv6 location more clear Change the mlx5 firmware interface header to make it more clear which bytes should be used by IPv4 or IPv6 addresses. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7f166955d4c9..68d73f82e009 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -298,6 +298,22 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_1[0x1a]; }; +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_0[0x80]; +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; @@ -328,9 +344,9 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 udp_sport[0x10]; u8 udp_dport[0x10]; - u8 src_ip[4][0x20]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6; - u8 dst_ip[4][0x20]; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6; }; struct mlx5_ifc_fte_match_set_misc_bits { -- cgit v1.2.3 From 038d2ef87572757861a177b19f9d489def2c48b8 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 11 Jan 2016 10:26:07 +0200 Subject: IB/mlx5: Add flow steering support Adding flow steering support by creating a flow-table per priority (if rules exist in the priority). mlx5_ib uses autogrouping and thus only creates the required destinations. Also includes adding of these flow steering utilities 1. Parsing verbs flow attributes hardware steering specs. 2. Check if flow is multicast - this is required in order to decide to which flow table will we add the steering rule. 3. Set outer headers in flow match criteria to zeros. Signed-off-by: Maor Gottlieb Signed-off-by: Moni Shoua Signed-off-by: Matan Barak Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index a94341271e3f..8230caa3fb6e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -38,6 +38,16 @@ #define MLX5_FS_DEFAULT_FLOW_TAG 0x0 +#define LEFTOVERS_RULE_NUM 2 +static inline void build_leftovers_ft_param(int *priority, + int *n_ent, + int *n_grp) +{ + *priority = 0; /* Priority of leftovers_prio-0 */ + *n_ent = LEFTOVERS_RULE_NUM; + *n_grp = LEFTOVERS_RULE_NUM; +} + enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, MLX5_FLOW_NAMESPACE_KERNEL, -- cgit v1.2.3 From b6a0e72ad3cffabaf30b856deb58fbe64a0f36a8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 11 Jan 2016 10:19:10 -0800 Subject: net: Fix typo in netdev_intersect_features Obviously need to 'or in NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM. Fixes: c8cd0989bd151f ("net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM") Reported-by: Jack Morgenstein Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2285596e7045..5ac140dcb789 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3873,9 +3873,9 @@ static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, { if ((f1 ^ f2) & NETIF_F_HW_CSUM) { if (f1 & NETIF_F_HW_CSUM) - f1 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); else - f2 |= (NETIF_F_IP_CSUM|NETIF_F_IP_CSUM); + f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } return f1 & f2; -- cgit v1.2.3